SLUB内存管理之slub初始化

Realdagongzai

已于 2022-06-19 15:59:15 修改

阅读量517

点赞数

分类专栏： linux内存管理文章标签： linux kernel 内存管理

于 2022-06-19 15:58:07 首次发布

本文链接：https://blog.csdn.net/weixin_45337360/article/details/125208818

版权

linux内存管理专栏收录该内容

15 篇文章 23 订阅

订阅专栏

在讲slub内存管理涉及的四个函数之前，先从slub内存分配算法的初始化开始。系统启动时，会进行slub内存分配算法的初始化，函数流程是：start_kernel() -> mm_init()->kmem_cache_init()。在start_kernel()函数中的setup_arch()里面会利用bootmem分配器进行启动阶段早期的内存分配，然后调用paging_init() -> bootmem_init()进行分页机制和内存管理的初始化。有了前面的基础，在mm_init()里面，首先，调用mem_init()初始化buddy system内存管理算法，至此bootmem分配器完成了任务；后面kmem_cache_init()在buddy system的基础上，进行slub内存分配的初始化。完成 kmem_cache_node 、 kmem_cache 、 kmalloc_caches 三个slab cache管理结构的初始化，对于内核中其他的slab cache的创建通过kmem_cache_create()函数完成。

一、函数详细调用关系图

二、kmem_cache_init函数代码流程

主要完成三个工作：创建 kmem_cache_node 、 kmem_cache 和 kmalloc_caches 三个slab cache

/* Cache of struct kmem_cache_node objects (linux-4.19.49/mm/slub.c). */
static struct kmem_cache *kmem_cache_node;
/* Cache of struct kmem_cache descriptors (linux-4.19.49/mm/slab_common.c). */
struct kmem_cache *kmem_cache;

/*
 * Slab cache creation flag: align objs on cache lines.
 * kmem_cache_init() passes it when creating both boot caches.
 */
#define SLAB_HWCACHE_ALIGN	((slab_flags_t __force)0x00002000U)

/*
 * From mm/slab.h.
 *
 * State of the slab allocator.
 *
 * This is used to describe the states of the allocator during bootup.
 * Allocators use this to gradually bootstrap themselves. Most allocators
 * have the problem that the structures used for managing slab caches are
 * allocated from slab caches themselves.
 *
 * In the code shown here, kmem_cache_init() sets PARTIAL once the
 * kmem_cache_node cache exists, and create_kmalloc_caches() sets UP once
 * the kmalloc array has been populated.
 */
enum slab_state {
	DOWN,			/* No slab functionality yet */
	PARTIAL,		/* SLUB: kmem_cache_node available */
	PARTIAL_NODE,		/* SLAB: kmalloc size for node struct available */
	UP,			/* Slab caches usable but not all extras yet */
	FULL			/* Everything is working */
};

/*
 * Boot-time initialisation of the SLUB allocator. Performs three main jobs:
 * creating the kmem_cache_node cache, creating the kmem_cache cache, and
 * populating the kmalloc_caches array.
 */
void __init kmem_cache_init(void)
{
	static __initdata struct kmem_cache boot_kmem_cache,
		boot_kmem_cache_node; /* static temporaries used until bootstrap() replaces them */

	if (debug_guardpage_minorder())
		slub_max_order = 0;

	/* Point both global cache pointers at the static temporaries for now. */
	kmem_cache_node = &boot_kmem_cache_node;
	kmem_cache = &boot_kmem_cache;

	/* 1. Create the cache whose objects are struct kmem_cache_node. */
	create_boot_cache(kmem_cache_node, "kmem_cache_node",
		sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);

	/* Register slab_memory_callback_nb for memory hotplug notifications. */
	register_hotmemory_notifier(&slab_memory_callback_nb);

	/*
	 * Able to allocate the per node structures: the kmem_cache_node cache
	 * now exists, so the allocator state advances to PARTIAL.
	 */
	slab_state = PARTIAL;

	/*
	 * 2. Create the cache whose objects are struct kmem_cache. The object
	 * size is the offset of the trailing node[] member plus nr_node_ids
	 * pointers to struct kmem_cache_node, so each descriptor carries
	 * exactly nr_node_ids node slots.
	 */
	create_boot_cache(kmem_cache, "kmem_cache",
			offsetof(struct kmem_cache, node) +
				nr_node_ids * sizeof(struct kmem_cache_node *),
		       SLAB_HWCACHE_ALIGN, 0, 0);

	/*
	 * 3. Move both descriptors out of the static temporaries into objects
	 * allocated from the kmem_cache cache; bootstrap() also repoints the
	 * slab pages on each node at the new descriptor.
	 */
	kmem_cache = bootstrap(&boot_kmem_cache);
	kmem_cache_node = bootstrap(&boot_kmem_cache_node);

	/* Now we can use the kmem_cache to allocate kmalloc slabs */
	/* 4. Patch the size_index table according to KMALLOC_MIN_SIZE. */
	setup_kmalloc_cache_index_table();
	/* 5. Create the kmalloc-<size> caches that fill kmalloc_caches[]. */
	create_kmalloc_caches(0);

	/* Setup random freelists for each cache */
	init_freelist_randomization();

	/* Install slub_cpu_dead as the handler for the CPUHP_SLUB_DEAD hotplug state. */
	cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
				  slub_cpu_dead);
	/*
	 * Log the resulting configuration: hardware alignment, min/max slab
	 * order, minimum objects per slab, CPU count and node count.
	 */
	pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%d\n",
		cache_line_size(), /* printed as HWalign */
		slub_min_order, slub_max_order, slub_min_objects,
		nr_cpu_ids, nr_node_ids);
}

2.1 create_boot_cache函数

在boot阶段时创建slab cache管理结构，包含： kmem_cache_node，kmem_cache，kmalloc_caches

/*
 * Create a cache during boot when no slab services are available yet.
 *
 * @s:          caller-provided descriptor to fill in
 * @name:       cache name, stored by pointer (not copied)
 * @size:       object size; used for both s->size and s->object_size
 * @flags:      slab flags, used for the alignment calculation and creation
 * @useroffset: stored in s->useroffset
 * @usersize:   stored in s->usersize
 *
 * Panics if __kmem_cache_create() reports an error.
 */
void __init create_boot_cache(struct kmem_cache *s, const char *name,
		unsigned int size, slab_flags_t flags,
		unsigned int useroffset, unsigned int usersize)
{
	int rc;

	/* Record the basic geometry of the cache in the descriptor. */
	s->name = name;
	s->object_size = size;
	s->size = size;
	s->useroffset = useroffset;
	s->usersize = usersize;
	s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);

	slab_init_memcg_params(s);

	/*
	 * Allocator-specific setup; a non-zero result is an error code.
	 * NOTE(review): per the accompanying article this goes through
	 * kmem_cache_open(), which sets up the per-node and per-cpu
	 * structures - not visible in this excerpt.
	 */
	rc = __kmem_cache_create(s, flags);
	if (rc != 0)
		panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n",
		      name, size, rc);

	/* Exempt from merging for now */
	s->refcount = -1;
}

2.2 bootstrap函数

在系统启动阶段，前期的管理很多都是借用临时变量空间的，所以将会通过bootstrap()将kmem_cache_node和kmem_cache的管理结构迁入到slub管理框架的对象空间中，实现自管理

/*
 * Used for early kmem_cache structures that were allocated using
 * the page allocator. Allocate them properly then fix up the pointers
 * that may be pointing to the wrong kmem_cache structure.
 *
 * Copies the static boot descriptor @static_cache into an object obtained
 * from the kmem_cache cache, repoints the slab pages on every node's lists
 * at the copy, and adds the copy to the global slab_caches list. This is
 * what lets the struct kmem_cache and struct kmem_cache_node caches manage
 * their own descriptors.
 *
 * Panics if the replacement descriptor cannot be allocated.
 *
 * Returns the newly allocated descriptor that supersedes @static_cache.
 */
static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
{
	int node;
	/* Zero-filled descriptor taken from the kmem_cache cache itself. */
	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
	struct kmem_cache_node *n;

	/*
	 * Fail with a clear message instead of copying through a NULL
	 * pointer below; create_kmalloc_cache() treats the same allocation
	 * failure the same way.
	 */
	if (!s)
		panic("Out of memory when bootstrapping slab cache %s\n",
		      static_cache->name);

	/* Copy the boot descriptor into its permanent home. */
	memcpy(s, static_cache, kmem_cache->object_size);

	/*
	 * This runs very early, and only the boot processor is supposed to be
	 * up.  Even if it weren't true, IRQs are not up so we couldn't fire
	 * IPIs around.
	 */
	__flush_cpu_slab(s, smp_processor_id());

	/*
	 * Every slab page already on a node list still refers to the static
	 * descriptor; repoint each one at the new copy.
	 */
	for_each_kmem_cache_node(s, node, n) {
		struct page *p;

		list_for_each_entry(p, &n->partial, lru)
			p->slab_cache = s;

		/* With SLUB debugging each node also keeps a full list. */
#ifdef CONFIG_SLUB_DEBUG
		list_for_each_entry(p, &n->full, lru)
			p->slab_cache = s;
#endif
	}

	/* Reset the memcg parameters embedded in the copied descriptor. */
	slab_init_memcg_params(s);

	/* Make the cache visible on the global slab_caches list. */
	list_add(&s->list, &slab_caches);

	/* NOTE(review): memcg linkage; details are in memcg_link_cache(), not shown here. */
	memcg_link_cache(s);
	return s;
}

/*
 * Reset the memcg bookkeeping embedded in cache descriptor @s: no root
 * cache, no memcg_caches array, an empty children list, and not dying.
 */
void slab_init_memcg_params(struct kmem_cache *s)
{
	/* Scalar fields first. */
	s->memcg_params.dying = false;
	s->memcg_params.root_cache = NULL;

	/* Then the list head and the RCU-managed pointer. */
	INIT_LIST_HEAD(&s->memcg_params.children);
	RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
}

2.3 setup_kmalloc_cache_index_table函数

根据KMALLOC_MIN_SIZE（默认是8，即 1 << KMALLOC_SHIFT_LOW，其中KMALLOC_SHIFT_LOW默认是3），更新创建kmalloc时要用到的size_index table；默认情况下函数里的循环和两个if分支都不会执行，直接使用已经定义好的size_index table即可

/*
 * Conversion table for small slabs sizes / 8 to the index in the
 * kmalloc array. This is necessary for slabs < 192 since we have non power
 * of two cache sizes there. The size of larger slabs can be determined using
 * fls.
 *
 * Entry i covers request sizes 8*i+1 .. 8*(i+1) bytes (the trailing comment
 * on each line gives the upper bound). The stored value is a kmalloc cache
 * index as used by kmalloc_info[]: 1 is the 96-byte cache, 2 is the 192-byte
 * cache, and n >= 3 is the 2^n-byte cache.
 */
static u8 size_index[24] __ro_after_init = {
	3,	/* 8 */
	4,	/* 16 */
	5,	/* 24 */
	5,	/* 32 */
	6,	/* 40 */
	6,	/* 48 */
	6,	/* 56 */
	6,	/* 64 */
	1,	/* 72 */
	1,	/* 80 */
	1,	/* 88 */
	1,	/* 96 */
	7,	/* 104 */
	7,	/* 112 */
	7,	/* 120 */
	7,	/* 128 */
	2,	/* 136 */
	2,	/* 144 */
	2,	/* 152 */
	2,	/* 160 */
	2,	/* 168 */
	2,	/* 176 */
	2,	/* 184 */
	2	/* 192 */
};

/*
 * Map a size in bytes to its slot in size_index[]: one slot per 8-byte
 * step, so 1..8 -> 0, 9..16 -> 1, and so on.
 */
static inline unsigned int size_index_elem(unsigned int bytes)
{
	const unsigned int last_byte = bytes - 1;

	return last_byte >> 3;
}

/* From linux/slab.h */
#ifdef CONFIG_SLUB
/*
 * SLUB directly allocates requests fitting in to an order-1 page
 * (PAGE_SIZE*2).  Larger requests are passed to the page allocator.
 */
#define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1) /* 13 when PAGE_SHIFT is 12 */
#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT - 1) /* 22 when MAX_ORDER is 11 and PAGE_SHIFT is 12 */
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW	3
#endif
#endif
/* Maximum allocatable size */
#define KMALLOC_MAX_SIZE	(1UL << KMALLOC_SHIFT_MAX)
/* Maximum size for which we actually use a slab cache */
#define KMALLOC_MAX_CACHE_SIZE	(1UL << KMALLOC_SHIFT_HIGH)
/* Maximum order allocatable via the slab allocator */
#define KMALLOC_MAX_ORDER	(KMALLOC_SHIFT_MAX - PAGE_SHIFT)

/*
 * Kmalloc subsystem.
 */
#ifndef KMALLOC_MIN_SIZE
/*
 * Defaults to 8 (1 << 3) when KMALLOC_SHIFT_LOW is 3. With that value
 * setup_kmalloc_cache_index_table() leaves size_index[] untouched.
 */
#define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW)
#endif

/*
 * Patch up the size_index table if we have strange large alignment
 * requirements for the kmalloc array. This is only the case for
 * MIPS it seems. The standard arches will not generate any code here.
 *
 * Largest permitted alignment is 256 bytes due to the way we
 * handle the index determination for the smaller caches.
 *
 * Make sure that nothing crazy happens if someone starts tinkering
 * around with ARCH_KMALLOC_MINALIGN.
 *
 * Article author's note (not shown in this excerpt - verify against the
 * kernel headers): ARCH_KMALLOC_MINALIGN is ARCH_DMA_MINALIGN when
 * ARCH_DMA_MINALIGN is defined and greater than 8, and ARCH_DMA_MINALIGN
 * is generally equal to the L1 cache line size; otherwise
 * ARCH_KMALLOC_MINALIGN is __alignof__(unsigned long long).
 */

/*
 * Compile-time assertion. When condition is non-zero the array size
 * evaluates to 1 - 2 = -1, which is an invalid array type, so the build
 * fails. When condition is zero the size is 1 and the expression does
 * nothing.
 */
#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))

/*
 * Adjust size_index[] when KMALLOC_MIN_SIZE is larger than 8 bytes.
 *
 * With the default KMALLOC_MIN_SIZE of 8 none of the loops or branches
 * below modify the table. The table is only written here, during boot.
 */
void __init setup_kmalloc_cache_index_table(void)
{
	unsigned int bytes;

	/* Build-time checks: at most 256, and a power of two. */
	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256);
	BUILD_BUG_ON(KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1));

	/*
	 * Sizes from 8 bytes up to (but excluding) KMALLOC_MIN_SIZE are all
	 * served by the smallest cache, index KMALLOC_SHIFT_LOW. Stop as
	 * soon as the slot would fall outside the table.
	 */
	for (bytes = 8; bytes < KMALLOC_MIN_SIZE; bytes += 8) {
		const unsigned int slot = size_index_elem(bytes);

		if (slot < ARRAY_SIZE(size_index))
			size_index[slot] = KMALLOC_SHIFT_LOW;
		else
			break;
	}

	if (KMALLOC_MIN_SIZE >= 64) {
		/*
		 * The 96 byte size cache is not used if the alignment
		 * is 64 byte. Sizes 72..96 go to index 7 (128 bytes).
		 */
		bytes = 64 + 8;
		while (bytes <= 96) {
			size_index[size_index_elem(bytes)] = 7;
			bytes += 8;
		}
	}

	if (KMALLOC_MIN_SIZE >= 128) {
		/*
		 * The 192 byte sized cache is not used if the alignment
		 * is 128 byte. Redirect kmalloc to use the 256 byte cache
		 * instead.
		 */
		bytes = 128 + 8;
		while (bytes <= 192) {
			size_index[size_index_elem(bytes)] = 8;
			bytes += 8;
		}
	}
}

2.4 create_kmalloc_caches函数

初始化kmalloc_caches数组，里面包含各种kmalloc-x缓存；后面kmalloc会用到这个kmalloc_caches数组，在“slub allocator工作原理”这篇文章里面有介绍kmalloc函数

/*
 * Table of kmalloc caches, indexed first by enum kmalloc_cache_type and then
 * by kmalloc index (0 .. KMALLOC_SHIFT_HIGH). Populated by
 * create_kmalloc_caches() through new_kmalloc_cache().
 */
struct kmem_cache *
kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
EXPORT_SYMBOL(kmalloc_caches);

/*
 * Whenever changing this, take care of that kmalloc_type() and
 * create_kmalloc_caches() still work as intended.
 */
enum kmalloc_cache_type {
	KMALLOC_NORMAL = 0,
	KMALLOC_RECLAIM,
#ifdef CONFIG_ZONE_DMA
	KMALLOC_DMA,
#endif
	NR_KMALLOC_TYPES
};

/* A table of kmalloc cache names and sizes */
extern const struct kmalloc_info_struct {
	const char *name;	/* cache name, e.g. "kmalloc-64" */
	unsigned int size;	/* object size in bytes */
} kmalloc_info[];
/*
 * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
 * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
 * kmalloc-67108864.
 *
 * The array position is the kmalloc index: entry 0 is unused, entries 1 and
 * 2 are the 96- and 192-byte caches, and entry n (n >= 3) is the 2^n-byte
 * cache.
 */
const struct kmalloc_info_struct kmalloc_info[] __initconst = { /* __initconst: presumably init-only constant data - confirm */
	{NULL,                      0},		{"kmalloc-96",             96},
	{"kmalloc-192",           192},		{"kmalloc-8",               8},
	{"kmalloc-16",             16},		{"kmalloc-32",             32},
	{"kmalloc-64",             64},		{"kmalloc-128",           128},
	{"kmalloc-256",           256},		{"kmalloc-512",           512},
	{"kmalloc-1k",           1024},		{"kmalloc-2k",           2048},
	{"kmalloc-4k",           4096},		{"kmalloc-8k",           8192},
	{"kmalloc-16k",         16384},		{"kmalloc-32k",         32768},
	{"kmalloc-64k",         65536},		{"kmalloc-128k",       131072},
	{"kmalloc-256k",       262144},		{"kmalloc-512k",       524288},
	{"kmalloc-1M",        1048576},		{"kmalloc-2M",        2097152},
	{"kmalloc-4M",        4194304},		{"kmalloc-8M",        8388608},
	{"kmalloc-16M",      16777216},		{"kmalloc-32M",      33554432},
	{"kmalloc-64M",      67108864}
};
/*
 * Create the kmalloc array. Some of the regular kmalloc arrays
 * may already have been created because they were needed to
 * enable allocations for slab creation.
 *
 * @flags: extra slab flags applied to every cache created here.
 *
 * Fills kmalloc_caches[][] for the NORMAL and RECLAIM types, marks the slab
 * allocator as UP, and then (with CONFIG_ZONE_DMA) adds a DMA cache for each
 * index that has a NORMAL cache.
 */
void __init create_kmalloc_caches(slab_flags_t flags)
{
	int i, type;

	for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
		/*
		 * One cache per power-of-two index from KMALLOC_SHIFT_LOW to
		 * KMALLOC_SHIFT_HIGH. With KMALLOC_SHIFT_HIGH at 13, the
		 * largest cache is kmalloc-8k per kmalloc_info[].
		 */
		for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
			if (!kmalloc_caches[type][i])
				/* 5.1 Create the cache for this slot if it does not exist yet. */
				new_kmalloc_cache(i, type, flags);

			/*
			 * Caches that are not of the two-to-the-power-of size.
			 * These have to be created immediately after the
			 * earlier power of two caches
			 */
			/*
			 * Indexes 1 and 2 are the 96- and 192-byte caches.
			 * They are created right after the 64-byte (i == 6)
			 * and 128-byte (i == 7) caches, and only when
			 * KMALLOC_MIN_SIZE is small enough for them to be used.
			 */
			if (KMALLOC_MIN_SIZE <= 32 && i == 6 &&
					!kmalloc_caches[type][1])
				new_kmalloc_cache(1, type, flags);
			if (KMALLOC_MIN_SIZE <= 64 && i == 7 &&
					!kmalloc_caches[type][2])
				new_kmalloc_cache(2, type, flags);
		}
	}
	/*
	 * The NORMAL and RECLAIM caches now exist, so the state moves to UP:
	 * slab caches are usable, though not all extras are available yet.
	 */
	/* Kmalloc array is now usable */
	slab_state = UP;

#ifdef CONFIG_ZONE_DMA
	/* With CONFIG_ZONE_DMA, mirror every existing NORMAL cache as a DMA cache. */
	for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
		/* Only indexes that have a NORMAL cache get a DMA counterpart. */
		struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i];

		if (s) {
			/* Object size for index i, matching kmalloc_info[i].size. */
			unsigned int size = kmalloc_size(i);
			/* Name of the form "dma-kmalloc-<size>". */
			const char *n = kmalloc_cache_name("dma-kmalloc", size);

			BUG_ON(!n);
			/* Create the DMA cache and record it in the table. */
			kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache(
				n, size, SLAB_CACHE_DMA | flags, 0, 0);
		}
	}
#endif
}

/*
 * 5.1 Create the kmalloc cache for slot @idx of type @type and store it in
 * kmalloc_caches[type][idx].
 *
 * @idx:   kmalloc index; selects the name and size from kmalloc_info[]
 * @type:  enum kmalloc_cache_type value
 * @flags: slab flags for the new cache
 *
 * KMALLOC_RECLAIM caches are named "kmalloc-rcl-<size>" and additionally get
 * SLAB_RECLAIM_ACCOUNT; every other type uses the name from kmalloc_info[].
 */
static void __init
new_kmalloc_cache(int idx, int type, slab_flags_t flags)
{
	const unsigned int obj_size = kmalloc_info[idx].size;
	const char *cache_name = kmalloc_info[idx].name;

	if (type == KMALLOC_RECLAIM) {
		/* 5.1.1 Reclaimable caches get a generated name. */
		cache_name = kmalloc_cache_name("kmalloc-rcl", obj_size);
		BUG_ON(!cache_name);
		flags |= SLAB_RECLAIM_ACCOUNT;
	}

	/* 5.1.2 The whole object is the user-copy region: offset 0, size obj_size. */
	kmalloc_caches[type][idx] = create_kmalloc_cache(cache_name, obj_size,
							  flags, 0, obj_size);
}

/*
 * 5.1.1 Build a cache name of the form "<prefix>-<number><unit>".
 *
 * @prefix: name prefix, e.g. "kmalloc-rcl" or "dma-kmalloc"
 * @size:   object size in bytes
 *
 * While @size is an exact multiple of 1024 it is scaled down and the unit
 * advances from none to 'k' to 'M', so 8 gives "<prefix>-8", 1024 gives
 * "<prefix>-1k" and 1048576 gives "<prefix>-1M". Scaling stops at 'M', so
 * larger multiples are reported in megabytes rather than indexing past the
 * end of units[].
 *
 * Returns the string produced by kasprintf(); the callers in this file
 * BUG_ON() a NULL result.
 */
static const char *
kmalloc_cache_name(const char *prefix, unsigned int size)
{
	/*
	 * units[0] is '\0': for unscaled sizes the %c conversion emits a NUL
	 * byte, so the visible name ends right after the number.
	 */
	static const char units[3] = "\0kM";
	int idx = 0;

	/* Never advance past the last unit ('M'). */
	while (idx < (int)sizeof(units) - 1 &&
	       size >= 1024 && (size % 1024 == 0)) {
		size /= 1024;
		idx++;
	}

	return kasprintf(GFP_NOWAIT, "%s-%u%c", prefix, size, units[idx]);
}

/*
 * 5.1.2 Core helper that creates one kmalloc cache.
 *
 * @name:       cache name
 * @size:       object size in bytes
 * @flags:      slab flags
 * @useroffset: passed through to create_boot_cache()
 * @usersize:   passed through to create_boot_cache()
 *
 * Allocates a descriptor from the kmem_cache cache, initialises it with
 * create_boot_cache(), adds it to the global slab_caches list and returns
 * it. Panics if the descriptor cannot be allocated.
 */
struct kmem_cache *__init create_kmalloc_cache(const char *name,
		unsigned int size, slab_flags_t flags,
		unsigned int useroffset, unsigned int usersize)
{
	/*
	 * Zero-filled struct kmem_cache descriptor, obtained through
	 * kmem_cache_zalloc() -> kmem_cache_alloc().
	 */
	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);

	/* No descriptor could be allocated: panic. */
	if (!s)
		panic("Out of memory when creating slab %s\n", name);

	/*
	 * Same initialisation path as the two boot caches. For kmalloc caches
	 * size is the object size, and new_kmalloc_cache() passes that same
	 * value as usersize. The boot caches were given a usersize of 0.
	 */
	create_boot_cache(s, name, size, flags, useroffset, usersize);

	/* Publish the cache on the global slab_caches list. */
	list_add(&s->list, &slab_caches);
	memcg_link_cache(s);

	/* create_boot_cache() left refcount at -1; set it to 1 now. */
	s->refcount = 1;
	/* The caller stores the result in kmalloc_caches[][]. */
	return s;
}

/*
 * Shortcuts
 *
 * kmem_cache_zalloc() is kmem_cache_alloc() with __GFP_ZERO added to the
 * caller's flags.
 */
static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
{
	const gfp_t zeroing_flags = flags | __GFP_ZERO;

	return kmem_cache_alloc(k, zeroing_flags);
}

/*
 * Determine size used for the nth kmalloc cache.
 * return size or 0 if a kmalloc cache for that
 * size does not exist
 *
 * Index 1 is the 96-byte cache and index 2 the 192-byte cache; each exists
 * only when KMALLOC_MIN_SIZE is small enough. Any index above 2 is the
 * 2^n-byte cache. With CONFIG_SLOB the result is always 0.
 */
static __always_inline unsigned int kmalloc_size(unsigned int n)
{
#ifndef CONFIG_SLOB
	switch (n) {
	case 0:
		break;
	case 1:
		if (KMALLOC_MIN_SIZE <= 32)
			return 96;
		break;
	case 2:
		if (KMALLOC_MIN_SIZE <= 64)
			return 192;
		break;
	default:
		/* Power-of-two caches. */
		return 1U << n;
	}
#endif
	return 0;
}