Memory Manager (15): kernel memory management, slab design and implementation, part 2 (initialization and creation)


Preface

The previous post covered the design of slab and its main data structures, but it left several questions open. Let me note two of them here.

1. What does "colouring" actually do? Is it only about reducing memory fragmentation?

2. The slab structure is not defined as such in the kernel source, so how does it manage allocation for objects of different types?

With these two questions in mind, let's dig into the slab implementation.

The slab implementation

First, a usage example

I originally wanted to start with initialization, but diving straight into the implementation is not very intuitive. It is better to first learn how to use the interface and then come back to see how it is implemented; that is a natural and sound way to learn.

As usual, we use our favourite tool: a kernel module.

First, create a small example structure of our own in our kernel development tree:

[c]

#ifndef _TEST_H
#define _TEST_H

typedef struct TEST {
	int num;
	char *name;
} TEST;

#endif

[/c]

Now let's look at the kernel module itself:

[c]

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/mm_types.h>
#include <linux/list.h>
#include <linux/types.h>
#include <linux/test.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("zmrlinux");
MODULE_DESCRIPTION("something about slab_create");

static struct kmem_cache *test;

static int __init slab_create(void)
{
	struct TEST *pos;	/* three example objects */
	struct TEST *temp;
	struct TEST *p;

	printk("\n\n");
	/* kmem_cache_create() builds a dedicated cache for our structure */
	test = kmem_cache_create("TEST", sizeof(struct TEST), 8,
				 SLAB_RED_ZONE, NULL);
	if (!test) {
		printk("create failed\n");
		return -ENOMEM;
	}
	printk("I created a kmem_cache, I feel good\n");
	printk("now allocate some objects from it\n");

	/* allocate one object from our cache */
	pos = kmem_cache_alloc(test, GFP_KERNEL);
	if (!pos) {
		printk("allocation from the cache failed\n");
		goto out_destroy;
	}
	printk("pos is ok, pos: %p\n", pos);

	temp = kmem_cache_alloc(test, GFP_KERNEL);
	if (!temp) {
		printk("allocation from the cache failed\n");
		goto out_free_pos;
	}
	printk("temp is ok, temp: %p\n", temp);

	p = kmem_cache_alloc(test, GFP_KERNEL);
	if (!p) {
		printk("allocation from the cache failed\n");
		goto out_free_temp;
	}
	printk("p is ok, p: %p\n", p);

	/* print the distance between two objects; pointer subtraction counts
	 * in objects, so cast to char * to get bytes */
	printk("distance between temp and pos is %ld bytes\n",
	       (long)((char *)temp - (char *)pos));

	kmem_cache_free(test, p);
	kmem_cache_free(test, temp);
	kmem_cache_free(test, pos);
	return 0;

out_free_temp:
	kmem_cache_free(test, temp);
out_free_pos:
	kmem_cache_free(test, pos);
out_destroy:
	kmem_cache_destroy(test);
	return -ENOMEM;
}

static void __exit slab_out(void)
{
	kmem_cache_destroy(test);	/* the cache must not outlive the module */
	printk("BYE BYE :)\n");
}

module_init(slab_create);
module_exit(slab_out);

[/c]

The result:

(Screenshot of the module's dmesg output, taken 2015-11-09 21:45:59.)

As you can see, the cache was created successfully and we obtained object instances from it.

Next, let's look at the implementation.

Initialization (__init)

One thing to point out first: while initializing slab, the kernel needs a number of memory blocks smaller than a full page, which would be best served by kmalloc. But kmalloc itself only works once the slab system is up, so some bootstrapping tricks are required. Let's keep reading.
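The trick begins with a cache that needs no allocation at all: the very first struct kmem_cache is a static variable. Roughly, from mm/slab.c of kernels in this range (a sketch; the exact field set varies by version):

[c]

/* the statically allocated boot cache; this is what the global kmem_cache
 * pointer refers to before dynamic allocation works (sketch) */
static struct kmem_cache kmem_cache_boot = {
	.batchcount	= 1,
	.limit		= BOOT_CPUCACHE_ENTRIES,
	.shared		= 1,
	.size		= sizeof(struct kmem_cache),
	.name		= "kmem_cache",
};

[/c]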

[c]

/*
 * Initialisation. Called after the page allocator have been initialised and
 * before smp_init().
 */

/* This runs after the buddy system is up but before the secondary CPUs are
 * started, so only the boot CPU is working at this point. With the timing
 * clear, we can look at the steps themselves. */
void __init kmem_cache_init(void)
{
	int i;

	BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) <
				sizeof(struct rcu_head));
	kmem_cache = &kmem_cache_boot;	/* the first cache is statically allocated */

	if (num_possible_nodes() == 1)	/* only one possible memory node (checked via a node bitmap) */
		use_alien_caches = 0;	/* so no alien caches are needed */

	for (i = 0; i < NUM_INIT_LISTS; i++)
		kmem_cache_node_init(&init_kmem_cache_node[i]);
	/* this loop initializes the static bootstrap kmem_cache_node
	 * structures; on NUMA there is one set per node (the SMP-only case
	 * is not covered here) */

	/*
	 * Fragmentation resistance on low memory - only use bigger
	 * page orders on machines with more than 32MB of memory if
	 * not overridden on the command line.
	 */
	/* determine the maximum slab order */
	if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
		slab_max_order = SLAB_MAX_ORDER_HI;

	/* Bootstrap is tricky, because several objects are allocated
	 * from caches that do not exist yet:
	 * 1) initialize the kmem_cache cache: it contains the struct
	 *    kmem_cache structures of all caches, except kmem_cache itself:
	 *    kmem_cache is statically allocated.
	 *    Initially an __init data area is used for the head array and the
	 *    kmem_cache_node structures, it's replaced with a kmalloc allocated
	 *    array at the end of the bootstrap.
	 * 2) Create the first kmalloc cache. The struct kmem_cache for the
	 *    new cache is allocated normally. An __init data area is used for
	 *    the head array.
	 * 3) Create the remaining kmalloc caches, with minimally sized
	 *    head arrays.
	 * 4) Replace the __init data head arrays for kmem_cache and the first
	 *    kmalloc cache with kmalloc allocated arrays.
	 * 5) Replace the __init data for kmem_cache_node for kmem_cache and
	 *    the other cache's with kmalloc allocated memory.
	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
	 */
	/* in short: bootstrap from static __init data, then replace it piece
	 * by piece with properly allocated memory */

	/* 1) create the kmem_cache */

	/*
	 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
	 */

	/* create the first cache instance; much of it is static, but its size
	 * depends on the number of nodes and CPUs */
	create_boot_cache(kmem_cache, "kmem_cache",
		offsetof(struct kmem_cache, node) +
				nr_node_ids * sizeof(struct kmem_cache_node *),
				SLAB_HWCACHE_ALIGN);
	/* create_boot_cache() ends up calling __kmem_cache_create(), which is
	 * discussed below */
	list_add(&kmem_cache->list, &slab_caches);	/* put kmem_cache on the slab_caches list */
	slab_state = PARTIAL;	/* record the new bootstrap state */

	/*
	 * Initialize the caches that provide memory for the kmem_cache_node
	 * structures first. Without this, further allocations will bug.
	 */
	kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node",
		kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
	slab_state = PARTIAL_NODE;	/* advance the state marker again */

	slab_early_init = 0;

	/* 5) Replace the bootstrap kmem_cache_node */
	{
		int nid;

		for_each_online_node(nid) {
			init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);

			init_list(kmalloc_caches[INDEX_NODE],
					&init_kmem_cache_node[SIZE_NODE + nid], nid);
		}
	}

	create_kmalloc_caches(ARCH_KMALLOC_FLAGS);
}

[/c]
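kmem_cache_init() drives the global slab_state from DOWN through PARTIAL and PARTIAL_NODE. The state enum lives in mm/slab.h; in kernels of this era it looks roughly like this (a sketch, comments follow the kernel's):

[c]

/* bootstrap progress of the slab allocator (mm/slab.h, approximate) */
enum slab_state {
	DOWN,		/* No slab functionality yet */
	PARTIAL,	/* SLUB: kmem_cache_node available */
	PARTIAL_NODE,	/* SLAB: kmalloc size for node struct available */
	UP,		/* Slab caches usable but not all extras yet */
	FULL		/* Everything is working */
};

[/c]

kmem_cache_init_late() then finishes the job, taking the state to UP and finally FULL: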


[c]

void __init kmem_cache_init_late(void)
{
	struct kmem_cache *cachep;

	slab_state = UP;

	/* 6) resize the head arrays to their final sizes */
	mutex_lock(&slab_mutex);	/* under the slab lock, walk every cache and resize its head arrays; BUG() if any resize fails */
	list_for_each_entry(cachep, &slab_caches, list)
		if (enable_cpucache(cachep, GFP_NOWAIT))
			BUG();
	mutex_unlock(&slab_mutex);	/* drop the lock */

	/* Done! */
	slab_state = FULL;	/* initialization is complete */

	/*
	 * Register a cpu startup notifier callback that initializes
	 * cpu_cache_get for all new cpus
	 */
	register_cpu_notifier(&cpucache_notifier);

#ifdef CONFIG_NUMA
	/*
	 * Register a memory hotplug callback that initializes and frees
	 * node.
	 */
	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
#endif

	/*
	 * The reap timers are started later, with a module init call: That part
	 * of the kernel is not yet operational.
	 */
}

/* initialize the per-CPU reap machinery that returns unneeded pages */
static int __init cpucache_init(void)
{
	int cpu;

	/*
	 * Register the timers that return unneeded pages to the page allocator
	 */
	for_each_online_cpu(cpu)
		start_cpu_timer(cpu);

	/* Done! */
	slab_state = FULL;
	return 0;
}
[/c]
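The "reap timers" mentioned in the comments are per-CPU deferrable works that periodically trim unused slabs. A sketch of start_cpu_timer() as it appears in mm/slab.c of kernels from this era (details vary by version):

[c]

/* arm a per-CPU deferrable work that periodically runs cache_reap()
 * to return unneeded pages to the buddy allocator (sketch) */
static void start_cpu_timer(int cpu)
{
	struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);

	if (reap_work->work.func == NULL) {	/* arm it only once per CPU */
		init_reap_node(cpu);
		INIT_DEFERRABLE_WORK(reap_work, cache_reap);
		schedule_delayed_work_on(cpu, reap_work,
					__round_jiffies_relative(HZ, cpu));
	}
}

[/c]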

Creating a cache: this is kmem_cache_create(), the function we used above.

[c]

/*
 * kmem_cache_create - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @ctor: A constructor for the objects.
 *
 * (Parameters: name, object size, alignment, SLAB flags, and a constructor,
 * which is rarely used nowadays.)
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within an interrupt, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns. (The red zones let us detect an out-of-bounds access
 * as soon as it happens.)
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline. This can be beneficial if you're counting cycles as closely
 * as davem.
 */
struct kmem_cache *
kmem_cache_create(const char *name, size_t size, size_t align,
		unsigned long flags, void (*ctor)(void *))
{
	struct kmem_cache *s;	/* the cache we are going to return */
	const char *cache_name;	/* our private copy of the name string */
	int err;

	get_online_cpus();	/* pin CPU hotplug (refcount +1) so the online masks stay stable */
	get_online_mems();	/* same for memory hotplug */
	memcg_get_cache_ids();	/* stabilize the memcg cache ids */

	mutex_lock(&slab_mutex);	/* take the global slab lock */

	err = kmem_cache_sanity_check(name, size);	/* check for name clashes and bad parameters */
	if (err) {
		s = NULL; /* suppress uninit var warning */
		goto out_unlock;
	}

	/*
	 * Some allocators will constraint the set of valid flags to a subset
	 * of all flags. We expect them to define CACHE_CREATE_MASK in this
	 * case, and we'll just provide them with a sanitized version of the
	 * passed flags.
	 */
	flags &= CACHE_CREATE_MASK;	/* with the flags sanitized, start the allocation work */

	s = __kmem_cache_alias(name, size, align, flags, ctor);	/* first, try to reuse an existing cache; detailed below */
	if (s) {	/* found a mergeable cache: return it after the cleanup below */
		goto out_unlock;
	}
	cache_name = kstrdup_const(name, GFP_KERNEL);	/* save a copy of the name string */
	if (!cache_name) {	/* bail out if the name could not be saved; note that kstrdup_const ends up in __do_kmalloc, so kmalloc is already usable here */
		err = -ENOMEM;
		goto out_unlock;
	}

	s = do_kmem_cache_create(cache_name, size, size,
			calculate_alignment(flags, align, size),
			flags, ctor, NULL, NULL);	/* create a fresh cache and initialize it */
	if (IS_ERR(s)) {
		err = PTR_ERR(s);
		kfree_const(cache_name);
	}

out_unlock:
	mutex_unlock(&slab_mutex);	/* drop the slab list lock */
	memcg_put_cache_ids();	/* release what we took above */
	put_online_mems();
	put_online_cpus();

	if (err) {
		if (flags & SLAB_PANIC)
			panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
				name, err);
		else {
			printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d",
				name, err);
			dump_stack();
		}
		return NULL;
	}
	return s;
}

[/c]
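kmem_cache_create() runs the requested alignment through calculate_alignment() before creating anything. A sketch of that helper from mm/slab_common.c of kernels in this range (exact form varies by version); note the loop that keeps halving the cache-line size while the object still fits in half of it, which is the "keep dividing the alignment by 2" behaviour referenced in find_mergeable() below:

[c]

/* sketch of calculate_alignment() (mm/slab_common.c, approximate) */
static unsigned long calculate_alignment(unsigned long flags,
		unsigned long align, unsigned long size)
{
	/*
	 * If the caller wants hardware cache aligned objects, follow that
	 * suggestion, but only if the object is large enough to be worth it.
	 */
	if (flags & SLAB_HWCACHE_ALIGN) {
		unsigned long ralign = cache_line_size();

		/* halve the line size while the object fits in half of it,
		 * so several small objects can share one cache line */
		while (size <= ralign / 2)
			ralign /= 2;
		align = max(align, ralign);
	}

	if (align < ARCH_SLAB_MINALIGN)
		align = ARCH_SLAB_MINALIGN;

	/* finally round the alignment itself up to a pointer multiple */
	return ALIGN(align, sizeof(void *));
}

[/c]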

__kmem_cache_alias: the first allocation path, in detail

[c]

struct kmem_cache *
__kmem_cache_alias(const char *name, size_t size, size_t align,
		unsigned long flags, void (*ctor)(void *))
{
	struct kmem_cache *cachep;

	cachep = find_mergeable(size, align, flags, name, ctor);	/* try to hand out an existing cache from the list; the details of this function are below */
	if (cachep) {	/* found one: bump its refcount and adjust the stored object size */
		cachep->refcount++;

		/*
		 * Adjust the object sizes so that we clear
		 * the complete object on kzalloc.
		 */
		cachep->object_size = max_t(int, cachep->object_size, size);
	}
	return cachep;
}

[/c]

[c]

/* normalize the parameters, then try to find a reusable cache among the
 * ones that already exist */
struct kmem_cache *find_mergeable(size_t size, size_t align,
		unsigned long flags, const char *name, void (*ctor)(void *))
{
	struct kmem_cache *s;

	if (slab_nomerge || (flags & SLAB_NEVER_MERGE))
		return NULL;

	if (ctor)
		return NULL;

	size = ALIGN(size, sizeof(void *));	/* round the size up to pointer alignment */
	align = calculate_alignment(flags, align, size);	/* compute the final alignment; even with a default value, the kernel keeps halving the cache-line alignment to pack as many objects per line as possible */
	size = ALIGN(size, align);	/* this macro is sneaky: with mask = (typeof(x))(a) - 1 it expands to ((x) + mask) & ~mask; see the demo below */

	flags = kmem_cache_flags(size, flags, name, NULL);
	/* walk slab_caches and try to return an existing, usable kmem_cache */
	list_for_each_entry_reverse(s, &slab_caches, list) {
		if (slab_unmergeable(s))	/* skip caches that refuse merging */
			continue;

		if (size > s->size)	/* its objects must be at least as large as ours */
			continue;

		if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME))
			continue;	/* the merge-relevant flags must match */
		/*
		 * Check if alignment is compatible.
		 * Courtesy of Adrian Drzewiecki
		 */
		if ((s->size & ~(align - 1)) != s->size)
			continue;	/* its object size must respect our alignment */

		if (s->size - size >= sizeof(void *))
			continue;	/* don't tolerate more than a pointer's worth of waste per object */

		if (IS_ENABLED(CONFIG_SLAB) && align &&
			(align > s->align || s->align % align))
			continue;	/* for SLAB, the alignments themselves must be compatible */

		return s;
	}
	return NULL;
}

[/c]
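Since ALIGN deserves its reputation, here is the arithmetic spelled out in a small userspace sketch (MY_ALIGN is a stand-in restating the kernel macro; it requires a power-of-two alignment):

[c]

#include <stdio.h>

/* userspace restatement of the kernel's ALIGN: round x up to the next
 * multiple of a (a must be a power of two) */
#define MY_ALIGN(x, a) (((x) + ((a) - 1)) & ~((a) - 1))

int main(void)
{
	/* adding a-1 pushes any non-multiple past the boundary; masking the
	 * low bits then snaps the value back down onto the boundary */
	printf("%d\n", MY_ALIGN(13, 8));	/* prints 16 */
	printf("%d\n", MY_ALIGN(16, 8));	/* prints 16: multiples stay put */
	printf("%d\n", MY_ALIGN(17, 8));	/* prints 24 */
	return 0;
}

[/c]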

***__kmem_cache_create(s, flags)***
Now look at this function, the one called during initialization; before it runs, create_boot_cache() has already been invoked. Below I stripped some of the DEBUG code to make the structure easier to see.

[c]
int
__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
	size_t left_over, freelist_size;
	size_t ralign = BYTES_PER_WORD;	/* start from the processor word size */
	gfp_t gfp;
	int err;
	size_t size = cachep->size;

	/*
	 * Check that size is in terms of words. This is needed to avoid
	 * unaligned accesses for some archs when redzoning is used, and makes
	 * sure any on-slab bufctl's are also correctly aligned.
	 */
	if (size & (BYTES_PER_WORD - 1)) {
		size += (BYTES_PER_WORD - 1);
		size &= ~(BYTES_PER_WORD - 1);
	}

	if (flags & SLAB_RED_ZONE) {
		ralign = REDZONE_ALIGN;
		/* If redzoning, ensure that the second redzone is suitably
		 * aligned, by adjusting the object size accordingly. */
		size += REDZONE_ALIGN - 1;
		size &= ~(REDZONE_ALIGN - 1);
	}

	/* 3) caller mandated alignment */
	if (ralign < cachep->align) {
		ralign = cachep->align;
	}
	/* disable debug if necessary */
	if (ralign > __alignof__(unsigned long long))
		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
	/*
	 * 4) Store it. (The computed alignment is saved in the cache.)
	 */
	cachep->align = ralign;

	if (slab_is_available())
		gfp = GFP_KERNEL;
	else
		gfp = GFP_NOWAIT;

	/*
	 * Determine if the slab management is 'on' or 'off' slab.
	 * (bootstrapping cannot cope with offslab caches so don't do
	 * it too early on. Always use on-slab management when
	 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
	 */
	if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init &&
		!(flags & SLAB_NOLEAKTRACE))
		/*
		 * Size is large, assume best to place the slab management obj
		 * off-slab (should allow better packing of objs).
		 */
		flags |= CFLGS_OFF_SLAB;

	size = ALIGN(size, cachep->align);
	/*
	 * We should restrict the number of objects in a slab to implement
	 * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition.
	 */
	if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
		size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);

	left_over = calculate_slab_order(cachep, size, cachep->align, flags);
	/* compute the ideal slab length (page order): too small raises the
	 * management overhead, too large works against the buddy allocator;
	 * this is an iterative search */
	if (!cachep->num)
		return -E2BIG;

	freelist_size = calculate_freelist_size(cachep->num, cachep->align);

	/*
	 * If the slab has been placed off-slab, and we have enough space then
	 * move it on-slab. This is at the expense of any extra colouring.
	 */
	/* decide whether the slab management head ends up inside or outside
	 * the slab itself */
	if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {
		flags &= ~CFLGS_OFF_SLAB;
		left_over -= freelist_size;
	}

	if (flags & CFLGS_OFF_SLAB) {
		/* really off slab. No need for manual alignment */
		freelist_size = calculate_freelist_size(cachep->num, 0);

#ifdef CONFIG_PAGE_POISONING
		/* If we're going to use the generic kernel_map_pages()
		 * poisoning, then it's going to smash the contents of
		 * the redzone and userword anyhow, so switch them off.
		 */
		if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
	}

	cachep->colour_off = cache_line_size();
	/* Offset must be a multiple of the alignment. */
	if (cachep->colour_off < cachep->align)
		cachep->colour_off = cachep->align;
	cachep->colour = left_over / cachep->colour_off;
	cachep->freelist_size = freelist_size;
	cachep->flags = flags;
	cachep->allocflags = __GFP_COMP;
	if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
		cachep->allocflags |= GFP_DMA;
	cachep->size = size;
	cachep->reciprocal_buffer_size = reciprocal_value(size);

	if (flags & CFLGS_OFF_SLAB) {
		cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
		/*
		 * This is a possibility for one of the kmalloc_{dma,}_caches.
		 * But since we go off slab only for object size greater than
		 * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created
		 * in ascending order, this should not happen at all.
		 * But leave a BUG_ON for some lucky dude.
		 */
		BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
	}

	err = setup_cpu_cache(cachep, gfp);
	if (err) {
		__kmem_cache_shutdown(cachep);
		return err;
	}
	/* the cache is fully set up */
	return 0;
}

[/c]
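The colour fields computed at the end of __kmem_cache_create() also answer question 1 from the preface: colouring spends the left_over bytes of a slab to shift each successive slab's first object by a multiple of colour_off (the cache-line size), so that objects at the same index in different slabs land on different cache lines. It is about spreading objects across the CPU cache, not just about fragmentation. A worked userspace sketch with assumed numbers (the freelist overhead is ignored for simplicity):

[c]

#include <stdio.h>

/* assumed numbers: one 4096-byte slab, 480-byte aligned objects,
 * 64-byte cache lines */
int main(void)
{
	unsigned int slab_bytes = 4096;
	unsigned int obj_size   = 480;
	unsigned int colour_off = 64;	/* cache_line_size() */

	unsigned int num       = slab_bytes / obj_size;		/* 8 objects */
	unsigned int left_over = slab_bytes - num * obj_size;	/* 256 bytes */
	unsigned int colour    = left_over / colour_off;	/* 4 colours */

	/* each new slab starts at the next colour offset, cycling 0..colour-1 */
	for (unsigned int s = 0; s < 6; s++)
		printf("slab %u: first object at byte offset %u\n",
			s, (s % colour) * colour_off);
	return 0;
}

[/c]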

A note on likely and unlikely:
In the Linux kernel you constantly run into likely() and unlikely(). They are macros; in branch-profiling builds they are defined as:
#define likely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 1))
#define unlikely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0))
They are optimization hints for the compiler: they state which outcome of a condition is the more probable one, so the compiler can place the likely path directly after the branch. The effect is that the common case executes straight-line code with no taken jump.
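In ordinary (non-profiling) builds the two macros reduce to __builtin_expect. A minimal sketch of the same idea in plain C (my_likely/my_unlikely are hypothetical stand-ins, not kernel names):

[c]

#include <stddef.h>

/* what the non-profiling kernel definitions boil down to */
#define my_likely(x)	__builtin_expect(!!(x), 1)
#define my_unlikely(x)	__builtin_expect(!!(x), 0)

int consume(const char *buf, size_t len)
{
	if (my_unlikely(buf == NULL))	/* cold error path */
		return -1;

	/* hot path: laid out as straight-line code, no taken branch in the
	 * common case */
	return (int)len;
}

[/c]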

Original post: http://zmr.lezifang.cn/2015/11/11/%e5%86%85%e5%ad%98%e7%ae%a1%e7%90%86%e5%99%a8%ef%bc%88%e5%8d%81%e4%ba%94%ef%bc%89kernel%e5%86%85%e5%ad%98%e7%ae%a1%e7%90%86-slab%e8%ae%be%e8%ae%a1%e4%b8%8e%e5%ae%9e%e7%8e%b02%ef%bc%88%e5%88%9d/
