5.1 Creating a slab descriptor

Each slab descriptor is described by one struct kmem_cache data structure, which is the core data structure of the slab allocator; we simply call it the slab descriptor. It is defined as follows:

include/linux/slab_def.h

/*
 * Definitions unique to the original Linux SLAB allocator.
 */

struct kmem_cache {
    struct array_cache __percpu *cpu_cache;/* a per-CPU struct array_cache, one per CPU: the local CPU's object cache pool */

/* 1) Cache tunables. Protected by slab_mutex */
    unsigned int batchcount;/* number of objects to fetch in one batch from the shared pool or from the
    slabs_partial/slabs_free lists when the local array_cache is empty */
    unsigned int limit; /* when the number of free objects in the local pool exceeds limit, batchcount objects are released, so the kernel can reclaim and destroy slabs */
    unsigned int shared;/* used on multicore (SMP) systems */

    unsigned int size;/* object length, including the bytes added for align padding */
    struct reciprocal_value reciprocal_buffer_size;
/* 2) touched by every alloc & free from the backend */

    unsigned int flags;     /* constant flags: the cache's allocation flags; CFLGS_OFF_SLAB is set when the management structure is stored off-slab */
    unsigned int num;       /* # of objs per slab: the maximum number of objects in one slab */

/* 3) cache_grow/shrink */
    /* order of pgs per slab (2^n): one slab occupies 2^gfporder pages */
    unsigned int gfporder;

    /* force GFP flags, e.g. GFP_DMA */
    gfp_t allocflags;

    size_t colour;          /* cache colouring range: the maximum number of colours, i.e. how many colour_off units the leftover space can hold */
    unsigned int colour_off;    /* colour offset: the length of one cache colour, equal to the L1 cache line size */
    struct kmem_cache *freelist_cache; /* when the slab management structure is off-slab, the cache used to allocate the freelist_size bytes holding the object index values */
    unsigned int freelist_size; /* size of page->freelist, which stores the object index values 0..num-1, one byte each */

    /* constructor func */
    void (*ctor)(void *obj);

/* 4) cache creation/removal */
    const char *name; /* name of the slab descriptor */
    struct list_head list;
    int refcount;
    int object_size; /* original object length, without the alignment padding */
    int align; /* alignment in bytes */

    struct kmem_cache_node *node[MAX_NUMNODES]; /* one struct kmem_cache_node per node on NUMA systems;
    the ARM Vexpress platform has only one node */
};

The slab descriptor gives each CPU its own object cache pool (array_cache).

The struct array_cache data structure is defined as follows:

struct array_cache {
    unsigned int avail;/* number of objects currently available in the local object cache pool */
    unsigned int limit; /* when the number of free objects in the local pool exceeds limit, batchcount objects are released, so the kernel can reclaim and destroy slabs */
    unsigned int batchcount;/* number of objects to fetch in one batch from the shared pool or from the
    slabs_partial/slabs_free lists when the local array_cache is empty */
    unsigned int touched;/* set to 1 when an object is taken from the pool and cleared to 0 when the cache shrinks, marking this pool as recently used */
    void *entry[];  /* pseudo-array holding the objects; it has no array members of its own and merely eases access to the objects placed right after the array_cache instance in memory
             * Must have this definition in here for the proper
             * alignment of array_cache. Also simplifies accessing
             * the entries.
             *
             * Entries should not be directly dereferenced as
             * entries belonging to slabs marked pfmemalloc will
             * have the lower bits set SLAB_OBJ_PFMEMALLOC
             */
};
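
To make the use of entry[] and touched concrete, here is a minimal sketch, not actual kernel code, of how the fast paths operate on this structure (the helper names ac_pop/ac_push are invented for illustration):

/* Minimal sketch (not kernel code) of the array_cache fast paths.
 * Objects are handed out LIFO, so a just-freed, cache-hot object is
 * reused first. */
static void *ac_pop(struct array_cache *ac)
{
    if (!ac->avail)
        return NULL;            /* empty: refill batchcount objects first */
    ac->touched = 1;            /* mark the pool as recently used */
    return ac->entry[--ac->avail];
}

static void ac_push(struct array_cache *ac, void *objp)
{
    /* if avail had reached limit, batchcount objects would first be
     * flushed back to the shared pool or the slab lists */
    ac->entry[ac->avail++] = objp;
}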
/*
 * The slab lists for all objects.
 */
struct kmem_cache_node {
    spinlock_t list_lock;

#ifdef CONFIG_SLAB
    struct list_head slabs_partial; /* partial list first, better asm code */
    struct list_head slabs_full;
    struct list_head slabs_free;
    unsigned long free_objects;
    unsigned int free_limit;
    unsigned int colour_next;   /* Per-node cache coloring: the colour of the next slab the kernel will create on this node */
    struct array_cache *shared; /* shared per node: the object cache pool shared within this node */
    struct alien_cache **alien; /* on other nodes: caches for objects that belong to other nodes */
    unsigned long next_reap;    /* updated without locking; defines the interval that must elapse between
    two attempts to shrink the cache. The idea is to prevent frequent cache shrinking and growing from
    degrading system performance, which can happen under certain loads. This technique is only used on NUMA systems. */
    int free_touched;       /* updated without locking; indicates whether the cache is active. Set to 1
    when an object is taken from the cache and reset to 0 when the cache shrinks. The kernel only shrinks
    the cache if free_touched was previously 0, since 1 means another part of the kernel has just taken
    objects from this cache and shrinking it now would be inappropriate. */
#endif

};

Initialization:

    To initialize the slab data structures, the kernel needs several memory blocks smaller than a full page, which are best allocated by kmalloc. And here is the crux: kmalloc can only be used once the slab system is already up.

    More precisely, the problem concerns the initialization of kmalloc's per-CPU caches. Before these caches can be initialized, kmalloc must be available to allocate the memory they need, yet kmalloc itself is still in the middle of being initialized. In other words, kmalloc could only be initialized after kmalloc has already been initialized, which is impossible, so the kernel must resort to a few tricks.

    The kmem_cache_init() function initializes the slab allocator. It is called during kernel initialization (start_kernel()), after the buddy system has been enabled. On multiprocessor systems the boot CPU is running at this point, while the other CPUs are not yet initialized. kmem_cache_init() activates the slab allocator step by step through a multi-stage process.

  1. kmem_cache_init() creates the first slab cache in the system, which provides memory for instances of kmem_cache. For this the kernel mostly uses static data created at compile time: a statically defined instance, kmem_cache_boot, serves as this first slab descriptor, and the cache is named "kmem_cache".

  2. kmem_cache_init() then initializes the generic caches that serve as a source for kmalloc memory. For each required cache size it calls create_kmalloc_cache()->create_boot_cache()->__kmem_cache_create(). Initializing the slab's per-CPU caches, however, would itself require kmalloc, which is not yet possible at this point.

    To solve this problem, the kernel uses the slab_state variable, which can take one of five values (DOWN, PARTIAL, PARTIAL_NODE, UP, FULL) reflecting how far kmalloc initialization has progressed.

    Initially the state is DOWN. It advances as the boot caches come up: to PARTIAL once the first cache (kmem_cache itself) has been created, to PARTIAL_NODE once the kmalloc cache used for struct kmem_cache_node is available, and finally to UP and FULL in kmem_cache_init_late().

/*
 * Initialisation.  Called after the page allocator have been initialised and
 * before smp_init().
 */
void __init kmem_cache_init(void)
{
    int i;

    BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) <
                    sizeof(struct rcu_head));
    kmem_cache = &kmem_cache_boot;

    if (num_possible_nodes() == 1)
        use_alien_caches = 0;

    /* Initialize the statically defined struct kmem_cache_node instances. Two sets of node instances are
            prepared here, for the first two kmem cache instances: the first kmem cache instance allocates
            space for other kmem cache instances, and the second allocates space for struct kmem_cache_node
            instances, so these first two caches need statically allocated struct kmem_cache_node instances. */
    for (i = 0; i < NUM_INIT_LISTS; i++)
        kmem_cache_node_init(&init_kmem_cache_node[i]);

    /*
     * Fragmentation resistance on low memory - only use bigger
     * page orders on machines with more than 32MB of memory if
     * not overridden on the command line.
     */
    if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
        slab_max_order = SLAB_MAX_ORDER_HI;

    /* Bootstrap is tricky, because several objects are allocated
     * from caches that do not exist yet:
     * 1) initialize the kmem_cache cache: it contains the struct
     *    kmem_cache structures of all caches, except kmem_cache itself:
     *    kmem_cache is statically allocated.
     *    Initially an __init data area is used for the head array and the
     *    kmem_cache_node structures, it's replaced with a kmalloc allocated
     *    array at the end of the bootstrap.
     * 2) Create the first kmalloc cache.
     *    The struct kmem_cache for the new cache is allocated normally.
     *    An __init data area is used for the head array.
     * 3) Create the remaining kmalloc caches, with minimally sized
     *    head arrays.
     * 4) Replace the __init data head arrays for kmem_cache and the first
     *    kmalloc cache with kmalloc allocated arrays.
     * 5) Replace the __init data for kmem_cache_node for kmem_cache and
     *    the other cache's with kmalloc allocated memory.
     * 6) Resize the head arrays of the kmalloc caches to their final sizes.
     */
    /* 1) create the kmem_cache */

    /*
     * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
     */
    /* Set up the first kmem cache instance, kmem_cache itself. The third argument is important: it is the
    size of the objects this cache maintains. After this initialization,
    kmem_cache->node[0] = init_kmem_cache_node[0].
    */
    create_boot_cache(kmem_cache, "kmem_cache",
        offsetof(struct kmem_cache, node) +
                  nr_node_ids * sizeof(struct kmem_cache_node *),
                  SLAB_HWCACHE_ALIGN);
    list_add(&kmem_cache->list, &slab_caches);
    slab_state = PARTIAL;

    /*
     * Initialize the caches that provide memory for the  kmem_cache_node
     * structures first.  Without this, further allocations will bug.
     */
    /* Create the second kmem cache instance; its object size shows that it is used to allocate space for
    struct kmem_cache_node. The pointer array kmalloc_caches holds the kmem cache instances used by kmalloc
    and is very important. For now this cache's node pointer is backed by the second statically defined
    init_kmem_cache_node instance. */

    kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node",
                kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
    slab_state = PARTIAL_NODE;

    slab_early_init = 0;

    /* 5) Replace the bootstrap kmem_cache_node */
    {
        int nid;

        for_each_online_node(nid) {
            /* copy the statically defined init_kmem_cache_node into newly allocated struct kmem_cache_node memory */
            init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);

            init_list(kmalloc_caches[INDEX_NODE],
                      &init_kmem_cache_node[SIZE_NODE + nid], nid);
        }
    }
    /* create the kmem cache instances needed by kmalloc */
    create_kmalloc_caches(ARCH_KMALLOC_FLAGS);
}

void __init kmem_cache_init_late(void)
{
    struct kmem_cache *cachep;

    slab_state = UP;

    /* 6) resize the head arrays to their final sizes */
    mutex_lock(&slab_mutex);
    list_for_each_entry(cachep, &slab_caches, list)
        if (enable_cpucache(cachep, GFP_NOWAIT))
            BUG();
    mutex_unlock(&slab_mutex);

    /* Done! */
    slab_state = FULL;

    /*
     * Register a cpu startup notifier callback that initializes
     * cpu_cache_get for all new cpus
     */
    register_cpu_notifier(&cpucache_notifier);

    /*
     * The reap timers are started later, with a module init call: That part
     * of the kernel is not yet operational.
     */
}

The initialization has only been sketched up to this point; next we look at how a slab descriptor is created at runtime.

The following example creates, on the ARM Vexpress platform, a slab descriptor named "figo_object" with an object size of 20 bytes, an align of 8 bytes and flags of 0, assuming an L1 cache line size of 16 bytes.
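
A minimal sketch of a caller for this example (the function figo_init() and the variable figo_object_cache are invented names; kmem_cache_create(), kmem_cache_alloc() and kmem_cache_free() are the real APIs):

#include <linux/init.h>
#include <linux/slab.h>

static struct kmem_cache *figo_object_cache;

static int __init figo_init(void)
{
    void *obj;

    /* name "figo_object", object size 20 bytes, align 8 bytes, flags 0, no constructor */
    figo_object_cache = kmem_cache_create("figo_object", 20, 8, 0, NULL);
    if (!figo_object_cache)
        return -ENOMEM;

    obj = kmem_cache_alloc(figo_object_cache, GFP_KERNEL); /* take one object */
    if (obj)
        kmem_cache_free(figo_object_cache, obj);           /* and give it back */
    return 0;
}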

kmem_cache_create() is implemented in the slab_common.c file.

/*
 * kmem_cache_create - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @ctor: A constructor for the objects.
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within a interrupt, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */
struct kmem_cache *
kmem_cache_create(const char *name, size_t size, size_t align,
          unsigned long flags, void (*ctor)(void *))
{
    struct kmem_cache *s;
    const char *cache_name;
    int err;

    get_online_cpus();
    get_online_mems();
    memcg_get_cache_ids();

    mutex_lock(&slab_mutex);

    err = kmem_cache_sanity_check(name, size);
    if (err) {
        s = NULL;   /* suppress uninit var warning */
        goto out_unlock;
    }

    /*
     * Some allocators will constraint the set of valid flags to a subset
     * of all flags. We expect them to define CACHE_CREATE_MASK in this
     * case, and we'll just provide them with a sanitized version of the
     * passed flags.
     */
    flags &= CACHE_CREATE_MASK;

    s = __kmem_cache_alias(name, size, align, flags, ctor);
    if (s)
        goto out_unlock;

    cache_name = kstrdup_const(name, GFP_KERNEL);
    if (!cache_name) {
        err = -ENOMEM;
        goto out_unlock;
    }

    s = do_kmem_cache_create(cache_name, size, size,
                 calculate_alignment(flags, align, size),
                 flags, ctor, NULL, NULL);
    if (IS_ERR(s)) {
        err = PTR_ERR(s);
        kfree_const(cache_name);
    }

out_unlock:
    mutex_unlock(&slab_mutex);

    memcg_put_cache_ids();
    put_online_mems();
    put_online_cpus();

    if (err) {
        if (flags & SLAB_PANIC)
            panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
                name, err);
        else {
            printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d",
                name, err);
            dump_stack();
        }
        return NULL;
    }
    return s;
}

kmem_cache_create() first calls __kmem_cache_alias() to check whether an existing slab descriptor can be reused; if none is found, do_kmem_cache_create() creates a new one.

kmem_cache_create()->do_kmem_cache_create()

static struct kmem_cache *
do_kmem_cache_create(const char *name, size_t object_size, size_t size,
             size_t align, unsigned long flags, void (*ctor)(void *),
             struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
    struct kmem_cache *s;
    int err;

    err = -ENOMEM;
    s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);/* first allocate a struct kmem_cache */
    if (!s)
        goto out;
    /* fill in name/size/align and the other fields */
    s->name = name;
    s->object_size = object_size;
    s->size = size;
    s->align = align;
    s->ctor = ctor;

    err = init_memcg_params(s, memcg, root_cache);
    if (err)
        goto out_free_cache;

    /* create the slab cache; see the implementation of this function below */
    err = __kmem_cache_create(s, flags);
    if (err)
        goto out_free_cache;

    s->refcount = 1;
    list_add(&s->list, &slab_caches);/* add the newly created slab descriptor to the global slab_caches list */
out:
    if (err)
        return ERR_PTR(err);
    return s;

out_free_cache:
    destroy_memcg_params(s);
    kmem_cache_free(kmem_cache, s);
    goto out;
}

__kmem_cache_create() implementation: it sets up the slab cache by initializing the necessary parameters; the actual allocation of memory for the slab does not happen here.

[kmem_cache_create()->do_kmem_cache_create()->__kmem_cache_create()]

/**
 * __kmem_cache_create - Create a cache.
 * @cachep: cache management descriptor
 * @flags: SLAB flags
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within a int, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory. (I.e., when a slab is set up and torn down, the objects are
 * filled with a predefined pattern; if the pattern is later found modified, some code has probably
 * accessed memory that did not belong to it.)
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns. (I.e., an extra memory area filled with a known byte pattern is added at the start
 * and end of each object; if the pattern is found modified when analysing kernel memory, some code has
 * probably accessed memory that did not belong to it.)
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */
int
__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
    size_t left_over, freelist_size;
    size_t ralign = BYTES_PER_WORD;
    gfp_t gfp;
    int err;
    size_t size = cachep->size;

    /*
     * Check that size is in terms of words.  This is needed to avoid
     * unaligned accesses for some archs when redzoning is used, and makes
     * sure any on-slab bufctl's are also correctly aligned.
        First check that size is aligned to the system word length (BYTES_PER_WORD). On the ARM Vexpress
        platform BYTES_PER_WORD is 4 bytes; our example's size of 20 bytes is already word aligned.
     */
    if (size & (BYTES_PER_WORD - 1)) {
        size += (BYTES_PER_WORD - 1);
        size &= ~(BYTES_PER_WORD - 1);
    }

    if (flags & SLAB_RED_ZONE) {
        ralign = REDZONE_ALIGN;
        /* If redzoning, ensure that the second redzone is suitably
         * aligned, by adjusting the object size accordingly. */
        size += REDZONE_ALIGN - 1;
        size &= ~(REDZONE_ALIGN - 1);
    }

    /* 3) caller mandated alignment: work out the alignment; in our example cachep->align is 8 bytes */
    if (ralign < cachep->align) {
        ralign = cachep->align;
    }
    /* disable debug if necessary */
    if (ralign > __alignof__(unsigned long long))
        flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
    /*
     * 4) Store it.
     */
    cachep->align = ralign;

    /* The slab_state enum describes the state of the slab system (DOWN, PARTIAL, PARTIAL_NODE, UP, FULL);
    once the slab mechanism is fully initialized the state becomes FULL. slab_is_available() returns true
    when the state is UP or FULL, in which case GFP_KERNEL may be used; otherwise only GFP_NOWAIT is
    allowed. The enum is listed after this function. */
    if (slab_is_available())
        gfp = GFP_KERNEL;
    else
        gfp = GFP_NOWAIT;

    /*
     * Determine if the slab management is 'on' or 'off' slab.
     * (bootstrapping cannot cope with offslab caches so don't do
     * it too early on. Always use on-slab management when
     * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
     */
    /* when the object size is at least PAGE_SIZE/8 (128 bytes with 4KB pages), the slab system considers the object large and sets the CFLGS_OFF_SLAB flag */
    if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init &&
        !(flags & SLAB_NOLEAKTRACE))
        /*
         * Size is large, assume best to place the slab management obj
         * off-slab (should allow better packing of objs).
         */
        flags |= CFLGS_OFF_SLAB;

    size = ALIGN(size, cachep->align);/* round size up according to align; in our example size is 20 bytes
       and align is 8 bytes, so the final size is 24 bytes */
    /*
     * We should restrict the number of objects in a slab to implement
     * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition.
     */
    if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
        size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
        
     /* the core calculation, listed below: how many physical pages one slab needs, how many objects a slab can hold, and how much room is left for colouring */
    left_over = calculate_slab_order(cachep, size, cachep->align, flags);

    if (!cachep->num)
        return -E2BIG;
    
    freelist_size = calculate_freelist_size(cachep->num, cachep->align);/* 168 in our example */

    /*
     * If the slab has been placed off-slab, and we have enough space then
     * move it on-slab. This is at the expense of any extra colouring.
     */
    if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {
        flags &= ~CFLGS_OFF_SLAB;
        left_over -= freelist_size;
    }

    if (flags & CFLGS_OFF_SLAB) {
        /* really off slab. No need for manual alignment */
        freelist_size = calculate_freelist_size(cachep->num, 0);

#ifdef CONFIG_PAGE_POISONING
        /* If we're going to use the generic kernel_map_pages()
         * poisoning, then it's going to smash the contents of
         * the redzone and userword anyhow, so switch them off.
         */
        if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
            flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
    }

    /* cache_line_size() returns the L1 cache line size. The ARM Vexpress platform uses a Cortex-A9
        processor, whose L1 cache line can be configured as 16, 32 or 64 bytes.
    */
    cachep->colour_off = cache_line_size();
    /* Offset must be a multiple of the alignment. */
    if (cachep->colour_off < cachep->align)
        cachep->colour_off = cachep->align;
    /* compute the number of cache colours: left_over divided by the L1 cache line size, i.e. how many L1
    lines fit into left_over. With the L1 cache line configured as 16 bytes, our example holds exactly one
    line; if the line size were 64 bytes, cache colouring would have no effect here. */
    cachep->colour = left_over / cachep->colour_off;
    cachep->freelist_size = freelist_size;
    cachep->flags = flags;
    cachep->allocflags = __GFP_COMP;
    if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
        cachep->allocflags |= GFP_DMA;
    cachep->size = size;
    cachep->reciprocal_buffer_size = reciprocal_value(size);

    if (flags & CFLGS_OFF_SLAB) {
        cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
        /*
         * This is a possibility for one of the kmalloc_{dma,}_caches.
         * But since we go off slab only for object size greater than
         * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created
         * in ascending order,this should not happen at all.
         * But leave a BUG_ON for some lucky dude.
         */
        BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
    }

    /* finally call setup_cpu_cache() to continue configuring the slab descriptor.
    If slab_state is FULL, i.e. the slab mechanism is fully initialized,
    it simply calls enable_cpucache();
    see the implementation below */
    err = setup_cpu_cache(cachep, gfp);
    if (err) {
        __kmem_cache_shutdown(cachep);
        return err;
    }

    return 0;
}
/*
 * State of the slab allocator.
 *
 * This is used to describe the states of the allocator during bootup.
 * Allocators use this to gradually bootstrap themselves. Most allocators
 * have the problem that the structures used for managing slab caches are
 * allocated from slab caches themselves.

 */
enum slab_state {
    DOWN,           /* No slab functionality yet */
    PARTIAL,        /* SLUB: kmem_cache_node available */
    PARTIAL_NODE,       /* SLAB: kmalloc size for node struct available */
    UP,         /* Slab caches usable but not all extras yet */
    FULL            /* Everything is working */
};
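
slab_is_available(), which __kmem_cache_create() used above to pick the GFP mask, is essentially a comparison against this enum; a simplified sketch of the helper in mm/slab_common.c:

bool slab_is_available(void)
{
    return slab_state >= UP; /* GFP_KERNEL is safe only from UP onwards */
}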

calculate_slab_order() implementation: it computes how many pages one slab needs and how many objects one slab can hold.

[kmem_cache_create()->do_kmem_cache_create()->__kmem_cache_create()->calculate_slab_order()]

/**
 * calculate_slab_order - calculate size (page order) of slabs
 * @cachep: pointer to the cache that is being created
 * @size: size of objects to be created in this cache.
 * @align: required alignment for the objects.
 * @flags: slab allocation flags
 *
 * Also calculates the number of objects per slab.
 *
 * This could be made much more intelligent.  For now, try to avoid using
 * high order pages for slabs.  When the gfp() functions are more friendly
 * towards high-order requests, this should be changed.
 */
static size_t calculate_slab_order(struct kmem_cache *cachep,
            size_t size, size_t align, unsigned long flags)
{
    unsigned long offslab_limit;
    size_t left_over = 0;
    int gfporder;

    /* The for loop tries gfporder values starting from 0, up to at most KMALLOC_MAX_ORDER. The SLAB
    allocator caps the largest slab at 2^25 bytes (32MB), which with 4KB pages works out to
    KMALLOC_MAX_ORDER = 10; the computation is shown after this function. */
    for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
        unsigned int num;
        size_t remainder;

        /* Compute how many objects fit into 2^gfporder pages; the space left over is used for cache
        colouring. For our example the final results are cachep->num = 163 (the maximum number of objects
        in one slab), cachep->gfporder = 0, left_over = 16 (leftover bytes, used for cache colouring) and
        freelist_size = 168. See the implementation of this function below.
        */
        cache_estimate(gfporder, size, align, flags, &remainder, &num);
        /* remainder, i.e. left_over, is the number of leftover bytes, 16 here; num is the number of objects that fit, 163 here */
        if (!num)
            continue;

        /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */
        if (num > SLAB_OBJ_MAX_NUM)
            break;

        if (flags & CFLGS_OFF_SLAB) {
            size_t freelist_size_per_obj = sizeof(freelist_idx_t);
            /*
             * Max number of objs-per-slab for caches which
             * use off-slab slabs. Needed to avoid a possible
             * looping condition in cache_grow().
             */
            if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
                freelist_size_per_obj += sizeof(char);
            offslab_limit = size;
            offslab_limit /= freelist_size_per_obj;

            if (num > offslab_limit)
                break;
        }

        /* Found something acceptable - save it away */
        cachep->num = num;
        cachep->gfporder = gfporder;
        left_over = remainder;

        /*
         * A VFS-reclaimable slab tends to have most allocations
         * as GFP_NOFS and we really don't want to have to be allocating
         * higher-order pages when we are unable to shrink dcache.
         */
        if (flags & SLAB_RECLAIM_ACCOUNT)
            break;

        /*
         * Large number of objects is good, but very large slabs are
         * currently bad for the gfp()s.
         */
        if (gfporder >= slab_max_order)
            break;

        /*
         * Acceptable internal fragmentation?
         */
        if (left_over * 8 <= (PAGE_SIZE << gfporder))
            break;
    }
    return left_over;
}
Back in __kmem_cache_create():

One slab consists of 2^gfporder physically contiguous pages and contains num objects, the colouring area and the freelist area.

KMALLOC_MAX_ORDER is computed as follows:

#ifdef CONFIG_SLAB
/*
 * The largest kmalloc size supported by the SLAB allocators is
 * 32 megabyte (2^25) or the maximum allocatable page order if that is
 * less than 32 MB.
 *
 * WARNING: Its not easy to increase this value since the allocators have
 * to do various tricks to work around compiler limitations in order to
 * ensure proper constant folding.
 */
#define KMALLOC_SHIFT_HIGH  ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \
                (MAX_ORDER + PAGE_SHIFT - 1) : 25)
#define KMALLOC_SHIFT_MAX   KMALLOC_SHIFT_HIGH
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW   5
#endif
#endif

#define MAX_ORDER 11
#define PAGE_SHIFT  12
/* Maximum order allocatable via the slab allocagtor */
#define KMALLOC_MAX_ORDER   (KMALLOC_SHIFT_MAX - PAGE_SHIFT)
So KMALLOC_SHIFT_HIGH = ((11 + 12 - 1) <= 25 ? (11 + 12 - 1) : 25) = (22 <= 25 ? 22 : 25) = 22,
and KMALLOC_MAX_ORDER = 22 - 12 = 10.

cache_estimate() implementation: it computes how many objects fit into 2^gfporder pages.

The calculation is:

obj_num (number of objects) = slab_size / (buffer_size + sizeof(freelist_idx_t))

left_over (leftover bytes) = slab_size - obj_num * buffer_size - freelist_size
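
As a sanity check, the following small userspace sketch (our own code, not from the kernel) reproduces these formulas with the example values used below:

#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
    unsigned long slab_size = 4096, buffer_size = 24, idx_size = 1, align = 8;
    unsigned long nr_objs, freelist_size, left_over;

    nr_objs = slab_size / (buffer_size + idx_size);                /* 163 */
    freelist_size = ALIGN(nr_objs * idx_size, align);              /* 168 */
    left_over = slab_size - nr_objs * buffer_size - freelist_size; /* 16 */

    printf("num=%lu freelist_size=%lu left_over=%lu\n",
           nr_objs, freelist_size, left_over);
    return 0;
}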

[kmem_cache_create()->do_kmem_cache_create()->__kmem_cache_create()->calculate_slab_order()->cache_estimate()]

For our example: sizeof(freelist_idx_t) = 1, gfporder = 0, buffer_size = 24 and align = 8, so
slab_size = 4096, nr_objs = 163 and left_over = 16.
/*
 * Calculate the number of objects and left-over bytes for a given buffer size.
 */
static void cache_estimate(unsigned long gfporder, size_t buffer_size,
               size_t align, int flags, size_t *left_over,
               unsigned int *num)
{
    int nr_objs;
    size_t mgmt_size;
    size_t slab_size = PAGE_SIZE << gfporder; // 4096

    if (flags & CFLGS_OFF_SLAB) {
        mgmt_size = 0;
        nr_objs = slab_size / buffer_size;

    } else {
        /*nr_objs = 163*/
        nr_objs = calculate_nr_objs(slab_size, buffer_size,
                    sizeof(freelist_idx_t), align);
        /*mgmt_size = 168*/
        mgmt_size = calculate_freelist_size(nr_objs, align);
    }
    *num = nr_objs;
    *left_over = slab_size - nr_objs*buffer_size - mgmt_size;/*4096 - 24*163-168 = 16 */
}

//slab_size = 4096, buffer_size = 24, idx_size = 1, align = 8

static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
                size_t idx_size, size_t align)
{
    int nr_objs;
    size_t remained_size;
    size_t freelist_size;
    int extra_space = 0;

    if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
        extra_space = sizeof(char);
    /*
     * Ignore padding for the initial guess. The padding
     * is at most @align-1 bytes, and @buffer_size is at
     * least @align. In the worst case, this result will
     * be one greater than the number of objects that fit
     * into the memory allocation when taking the padding
     * into account.
     */
    //nr_objs = 4096/(24+1) = 163
    nr_objs = slab_size / (buffer_size + idx_size + extra_space);

    /*
     * This calculated number will be either the right
     * amount, or one greater than what we want.
     */
    //remained_size = 4096 - 163*24 = 184
    remained_size = slab_size - nr_objs * buffer_size;
    freelist_size = calculate_freelist_size(nr_objs, align); /* computed freelist_size = 168 */
    if (remained_size < freelist_size)
        nr_objs--;

    return nr_objs;
}
//nr_objs = 163, align = 8

static size_t calculate_freelist_size(int nr_objs, size_t align)
{
    size_t freelist_size;
        /*freelist_size = 163*1 = 163*/
    freelist_size = nr_objs * sizeof(freelist_idx_t);
    if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
        freelist_size += nr_objs * sizeof(char);

    if (align)
        freelist_size = ALIGN(freelist_size, align); /*freelist_size = (163+7)&~7 = 168*/

    return freelist_size;
}
Back to calculate_slab_order().

setup_cpu_cache() implementation:

[__kmem_cache_create()->setup_cpu_cache()->enable_cpucache()]

static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
    if (slab_state >= FULL)
        return enable_cpucache(cachep, gfp);

    cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1);
    if (!cachep->cpu_cache)
        return 1;

    if (slab_state == DOWN) {
        /* Creation of first cache (kmem_cache). */
        set_up_node(kmem_cache, CACHE_CACHE);
    } else if (slab_state == PARTIAL) {
        /* For kmem_cache_node */
        set_up_node(cachep, SIZE_NODE);
    } else {
        int node;

        for_each_online_node(node) {
            cachep->node[node] = kmalloc_node(
                sizeof(struct kmem_cache_node), gfp, node);
            BUG_ON(!cachep->node[node]);
            kmem_cache_node_init(cachep->node[node]);
        }
    }

    cachep->node[numa_mem_id()]->next_reap =
            jiffies + REAPTIMEOUT_NODE +
            ((unsigned long)cachep) % REAPTIMEOUT_NODE;

    cpu_cache_get(cachep)->avail = 0;
    cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
    cpu_cache_get(cachep)->batchcount = 1;
    cpu_cache_get(cachep)->touched = 0;
    cachep->batchcount = 1;
    cachep->limit = BOOT_CPUCACHE_ENTRIES;
    return 0;
}
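
Note the bootstrap paths above: while slab_state is below FULL, the per-CPU cache is set up minimally via alloc_kmem_cache_cpus(cachep, 1, 1), with limit = BOOT_CPUCACHE_ENTRIES = 1 and batchcount = 1; these head arrays are only resized to their final values later, when kmem_cache_init_late() calls enable_cpucache().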


/* Called with slab_mutex held always */
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
    int err;
    int limit = 0;
    int shared = 0;
    int batchcount = 0;

    if (!is_root_cache(cachep)) {
        struct kmem_cache *root = memcg_root_cache(cachep);
        limit = root->limit;
        shared = root->shared;
        batchcount = root->batchcount;
    }

    if (limit && shared && batchcount)
        goto skip_setup;
    /*
     * The head array serves three purposes:
     * - create a LIFO ordering, i.e. return objects that are cache-warm
     * - reduce the number of spinlock operations.
     * - reduce the number of linked list operations on the slab and
     *   bufctl chains: array operations are cheaper.
     * The numbers are guessed, we should auto-tune as described by
     * Bonwick.
        The maximum threshold of free objects, limit, is chosen according to the object size; for our example limit ends up as 120.
     */
    if (cachep->size > 131072)
        limit = 1;
    else if (cachep->size > PAGE_SIZE)
        limit = 8;
    else if (cachep->size > 1024)
        limit = 24;
    else if (cachep->size > 256)
        limit = 54;
    else
        limit = 120;

    /*
     * CPU bound tasks (e.g. network routing) can exhibit cpu bound
     * allocation behaviour: Most allocs on one cpu, most free operations
     * on another cpu. For these cases, an efficient object passing between
     * cpus is necessary. This is provided by a shared array. The array
     * replaces Bonwick's magazine layer.
     * On uniprocessor, it's functionally equivalent (but less efficient)
     * to a larger limit. Thus disabled by default.
        On SMP systems, when the slab object size is no larger than one page, shared is set to 8.
     */
    shared = 0;
    if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
        shared = 8;

#if DEBUG
    /*
     * With debugging enabled, large batchcount lead to excessively long
     * periods with disabled local interrupts. Limit the batchcount
     */
    if (limit > 32)
        limit = 32;
#endif
    batchcount = (limit + 1) / 2; /* batchcount is usually half of the limit threshold; it is the number of objects moved at a time between the local CPU's pool and the shared pool */
skip_setup:
    err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);/* continue configuring the slab descriptor in do_tune_cpucache() */
    if (err)
        printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
               cachep->name, -err);
    return err;
}
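
For our figo_object example, size is 24 bytes (not larger than 256), so limit = 120; with more than one possible CPU, shared = 8; and batchcount = (120 + 1) / 2 = 60.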

do_tune_cpucache() implementation:

__kmem_cache_create()->setup_cpu_cache()->enable_cpucache()->do_tune_cpucache()

static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
                int batchcount, int shared, gfp_t gfp)
{
    int ret;
    struct kmem_cache *c;

    ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);

    if (slab_state < FULL)
        return ret;

    if ((ret < 0) || !is_root_cache(cachep))
        return ret;

    lockdep_assert_held(&slab_mutex);
    for_each_memcg_cache(c, cachep) {
        /* return value determined by the root cache only */
        __do_tune_cpucache(c, limit, batchcount, shared, gfp);
    }

    return ret;
}

/* Always called with the slab_mutex held */
static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
                int batchcount, int shared, gfp_t gfp)
{
    struct array_cache __percpu *cpu_cache, *prev;
    int cpu;
    
    /* First allocate the per-CPU struct array_cache pointers via alloc_kmem_cache_cpus(), one per CPU in
    the system. The current CPU's array_cache is called the local object cache pool; there is also a
    separate concept, the shared object cache pool. */
    cpu_cache = alloc_kmem_cache_cpus(cachep, limit, batchcount);
    if (!cpu_cache)
        return -ENOMEM;

    prev = cachep->cpu_cache;
    cachep->cpu_cache = cpu_cache;/* the newly allocated pool becomes the slab descriptor's local object cache pool */
    kick_all_cpus_sync();

    check_irq_on();
    cachep->batchcount = batchcount;
    cachep->limit = limit;
    cachep->shared = shared;

    if (!prev)
        goto alloc_node;

    for_each_online_cpu(cpu) {
        LIST_HEAD(list);
        int node;
        struct kmem_cache_node *n;
        struct array_cache *ac = per_cpu_ptr(prev, cpu);

        node = cpu_to_mem(cpu);
        n = get_node(cachep, node);
        spin_lock_irq(&n->list_lock);
        free_block(cachep, ac->entry, ac->avail, node, &list);
        spin_unlock_irq(&n->list_lock);
        slabs_destroy(cachep, &list);
    }
    free_percpu(prev);

alloc_node:
    return alloc_kmem_cache_node(cachep, gfp);/* continue by initializing the cache's struct kmem_cache_node structures in alloc_kmem_cache_node() */
}

static struct array_cache __percpu *alloc_kmem_cache_cpus(
        struct kmem_cache *cachep, int entries, int batchcount)
{
    int cpu;
    size_t size;
    struct array_cache __percpu *cpu_cache;

    size = sizeof(void *) * entries + sizeof(struct array_cache);/* note that the size accounts for the pool's maximum threshold limit; the entries parameter is that limit */
    cpu_cache = __alloc_percpu(size, sizeof(void *));/* allocate per-CPU memory; the percpu allocator itself is not covered here */

    if (!cpu_cache)
        return NULL;

    for_each_possible_cpu(cpu) {
        /* init_arraycache sets the pool's limit and batchcount; here limit is 120 and batchcount is 60 */
        init_arraycache(per_cpu_ptr(cpu_cache, cpu),
                entries, batchcount);
    }

    return cpu_cache;
}
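
For our example, entries equals the limit of 120, so each per-CPU pool occupies sizeof(void *) * 120 + sizeof(struct array_cache) bytes, i.e. 480 bytes of entry pointers plus a 16-byte header on a 32-bit system such as ARM Vexpress (four unsigned ints; the entry[] flexible array itself contributes no size).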

__kmem_cache_create()->setup_cpu_cache()->enable_cpucache()->do_tune_cpucache()->__do_tune_cpucache()->alloc_kmem_cache_node()

/*
 * This initializes kmem_cache_node or resizes various caches for all nodes.
 */
static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
{
    int node;
    struct kmem_cache_node *n;
    struct array_cache *new_shared;
    struct alien_cache **new_alien = NULL;

    /* the for loop iterates over all NUMA nodes in the system; the ARM Vexpress platform has only one memory node */
    for_each_online_node(node) {

        if (use_alien_caches) {
            new_alien = alloc_alien_cache(node, cachep->limit, gfp);
            if (!new_alien)
                goto fail;
        }

        /* If cachep->shared is greater than 0 (on multicore systems it is; enable_cpucache() initialized
        it to 8), allocate a shared object cache pool, new_shared, via alloc_arraycache() for sharing free
        objects among the CPUs. */
        new_shared = NULL;
        if (cachep->shared) {
            /* alloc_arraycache allocates the shared pool: a struct array_cache whose entry array can hold cachep->shared * cachep->batchcount objects */
            new_shared = alloc_arraycache(node,
                cachep->shared*cachep->batchcount,
                    0xbaadf00d, gfp);
            if (!new_shared) {
                free_alien_cache(new_alien);
                goto fail;
            }
        }

        /* look up this node's kmem_cache_node; in our example it has not been allocated yet */
        n = get_node(cachep, node);
        if (n) {
            struct array_cache *shared = n->shared;
            LIST_HEAD(list);

            spin_lock_irq(&n->list_lock);

            if (shared)
                free_block(cachep, shared->entry,
                        shared->avail, node, &list);

            n->shared = new_shared;
            if (!n->alien) {
                n->alien = new_alien;
                new_alien = NULL;
            }
            n->free_limit = (1 + nr_cpus_node(node)) *
                    cachep->batchcount + cachep->num;
            spin_unlock_irq(&n->list_lock);
            slabs_destroy(cachep, &list);
            kfree(shared);
            free_alien_cache(new_alien);
            continue;
        }
        /* allocate a new kmem_cache_node, which we will simply call a slab node; the data structure is described below */
        n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
        if (!n) {
            free_alien_cache(new_alien);
            kfree(new_shared);
            goto fail;
        }

        kmem_cache_node_init(n);
        n->next_reap = jiffies + REAPTIMEOUT_NODE +
                ((unsigned long)cachep) % REAPTIMEOUT_NODE;
        n->shared = new_shared;
        n->alien = new_alien;
        n->free_limit = (1 + nr_cpus_node(node)) *
                    cachep->batchcount + cachep->num;
        cachep->node[node] = n;
    }
    return 0;

fail:
    if (!cachep->list.next) {
        /* Cache is not active yet. Roll back what we did */
        node--;
        while (node >= 0) {
            n = get_node(cachep, node);
            if (n) {
                kfree(n->shared);
                free_alien_cache(n->alien);
                kfree(n);
                cachep->node[node] = NULL;
            }
            node--;
        }
    }
    return -ENOMEM;
}


#ifndef CONFIG_SLOB
/*
 * The slab lists for all objects.
    It contains three slab lists, for slabs that are partially free, fully used, and free.
    free_objects is the total number of free objects on those three lists; free_limit is the maximum
    number of free objects allowed across all slabs.
    The slab node also holds the object cache pool shared between the CPUs of one NUMA node.
 */
struct kmem_cache_node {
    spinlock_t list_lock;

#ifdef CONFIG_SLAB
    struct list_head slabs_partial; /* partial list first, better asm code */
    struct list_head slabs_full;
    struct list_head slabs_free;
    unsigned long free_objects; /* total number of free objects on the three lists */
    unsigned int free_limit;    /* maximum threshold of free objects allowed in the slabs */
    unsigned int colour_next;   /* Per-node cache coloring */
    struct array_cache *shared; /* shared per node: an object cache pool shared by the CPUs of this node */
    struct alien_cache **alien; /* on other nodes */
    unsigned long next_reap;    /* updated without locking */
    int free_touched;       /* updated without locking */
#endif

#ifdef CONFIG_SLUB
    unsigned long nr_partial;
    struct list_head partial;
#ifdef CONFIG_SLUB_DEBUG
    atomic_long_t nr_slabs;
    atomic_long_t total_objects;
    struct list_head full;
#endif
#endif

};

At this point the creation of the slab descriptor is complete. Below we revisit the important data structures of the slab allocator and list the values computed for our example, so that readers can verify them while reading the code. The example:

On the ARM Vexpress platform, create a slab descriptor named "figo_object" with a size of 20 bytes, an align of 8 bytes and flags of 0, assuming an L1 cache line size of 16 bytes. The related members of the slab descriptor work out as follows:
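
Summarizing the values computed in the walkthrough above:

    object_size   = 20    (the requested object size)
    align         = 8
    size          = 24    (20 rounded up to the 8-byte alignment)
    gfporder      = 0     (one 4KB page per slab)
    num           = 163   (objects per slab)
    freelist_size = 168   (ALIGN(163 * 1, 8))
    colour_off    = 16    (the L1 cache line size)
    colour        = 1     (left_over 16 / colour_off 16)
    limit         = 120
    batchcount    = 60
    shared        = 8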

 
