Linux内存管理之slab分配器分析(三 创建cache)

Kernel提供了kmem_cache_create函数用于创建Cache,下面我们直接从API入手。

函数有点长,逐行分析一下。

/**
 * kmem_cache_create - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @ctor: A constructor for the objects.
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within a int, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * @name must be valid until the cache is destroyed. This implies that
 * the module calling this has to destroy the cache before getting unloaded.
 * Note that kmem_cache_name() is not guaranteed to return the same pointer,
 * therefore applications must manage it themselves.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline. This can be beneficial if you're counting cycles as closely
 * as davem.
 */
/* 创建成功后,cache中没有任何slab及对象,当分配对象时才会创建新的slab */
struct kmem_cache *
kmem_cache_create (const char *name, size_t size, size_t align,
    unsigned long flags, void (*ctor)(void *))
{
    size_t left_over, slab_size, ralign;
    struct kmem_cache *cachep = NULL, *pc;
    gfp_t gfp;

    /*
     * Sanity checks... these are all serious usage bugs.
     */
    /* Reject missing name, interrupt context, objects smaller than one
     * machine word, or objects larger than KMALLOC_MAX_SIZE. */
    if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
     size > KMALLOC_MAX_SIZE) {
        printk(KERN_ERR "%s: Early error in slab %s\n", __func__,
                name);
        BUG();
    }

    /*
     * We use cache_chain_mutex to ensure a consistent view of
     * cpu_online_mask as well. Please see cpuup_callback
     */
    /* Lock only once the slab allocator is usable (g_cpucache_up >=
     * EARLY, see the earlier init analysis). During early boot a single
     * CPU performs initialization, so no locking is needed. */
    if (slab_is_available()) {
        get_online_cpus();
        mutex_lock(&cache_chain_mutex);
    }
    /* All caches are linked on cache_chain; walk it to detect clashes. */
    list_for_each_entry(pc, &cache_chain, next) {
        char tmp;
        int res;

        /*
         * This happens when the module gets unloaded and doesn't
         * destroy its slab cache and no-one else reuses the vmalloc
         * area of the module. Print a warning.
         */
        /* Warn about and skip caches whose name memory is unreadable. */
        res = probe_kernel_address(pc->name, tmp);
        if (res) {
            printk(KERN_ERR
             "SLAB: cache with size %d has lost its name\n",
             pc->buffer_size);
            continue;
        }
        /* Refuse to create a second cache with the same name. */
        if (!strcmp(pc->name, name)) {
            printk(KERN_ERR
             "kmem_cache_create: duplicate cache %s\n", name);
            dump_stack();
            goto oops;
        }
    }

#if DEBUG    // debug build only
    WARN_ON(strchr(name, ' '));    /* It confuses parsers */
#if FORCED_DEBUG // debug build only
    /*
     * Enable redzoning and last user accounting, except for caches with
     * large objects, if the increased size would increase the object size
     * above the next power of two: caches with object sizes just above a
     * power of two have a significant amount of internal fragmentation.
     */
    if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
                        2 * sizeof(unsigned long long)))
        flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
    if (!(flags & SLAB_DESTROY_BY_RCU))
        flags |= SLAB_POISON;
#endif
    if (flags & SLAB_DESTROY_BY_RCU)
        BUG_ON(flags & SLAB_POISON);
#endif
    /*
     * Always checks flags, a caller might be expecting debug support which
     * isn't available.
     */
    BUG_ON(flags & ~CREATE_MASK);

    /*
     * Check that size is in terms of words. This is needed to avoid
     * unaligned accesses for some archs when redzoning is used, and makes
     * sure any on-slab bufctl's are also correctly aligned.
     */
    /* Round size up to a multiple of BYTES_PER_WORD. */
    if (size & (BYTES_PER_WORD - 1)) {
        size += (BYTES_PER_WORD - 1);
        size &= ~(BYTES_PER_WORD - 1);
    }

    /* calculate the final buffer alignment: */

    /* 1) arch recommendation: can be overridden for debug */
    /* Align to the hardware cache line; for small objects the alignment
     * is halved so several objects can share one cache line. */
    if (flags & SLAB_HWCACHE_ALIGN) {
        /*
         * Default alignment: as specified by the arch code. Except if
         * an object is really small, then squeeze multiple objects into
         * one cacheline.
         */
        ralign = cache_line_size();
        while (size <= ralign / 2)
            ralign /= 2;
    } else {
        ralign = BYTES_PER_WORD;
    }

    /*
     * Redzoning and user store require word alignment or possibly larger.
     * Note this will be overridden by architecture or caller mandated
     * alignment if either is greater than BYTES_PER_WORD.
     */
    if (flags & SLAB_STORE_USER)
        ralign = BYTES_PER_WORD;

    if (flags & SLAB_RED_ZONE) {
        ralign = REDZONE_ALIGN;
        /* If redzoning, ensure that the second redzone is suitably
         * aligned, by adjusting the object size accordingly. */
        size += REDZONE_ALIGN - 1;
        size &= ~(REDZONE_ALIGN - 1);
    }

    /* 2) arch mandated alignment */
    if (ralign < ARCH_SLAB_MINALIGN) {
        ralign = ARCH_SLAB_MINALIGN;
    }
    /* 3) caller mandated alignment */
    if (ralign < align) {
        ralign = align;
    }
    /* disable debug if necessary */
    if (ralign > __alignof__(unsigned long long))
        flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
    /*
     * 4) Store it.
     */
    /* Record the final alignment. */
    align = ralign;

    /* GFP_KERNEL (may sleep) once the slab allocator is available. */
    if (slab_is_available())
        gfp = GFP_KERNEL;
    else
        /* GFP_NOWAIT before slab init completes: must not block, and
         * allocates from low memory only. */
        gfp = GFP_NOWAIT;

    /* Get cache's description obj. */
    /* Allocate the struct kmem_cache descriptor from cache_cache, whose
     * objects are exactly struct kmem_cache. */
    cachep = kmem_cache_zalloc(&cache_cache, gfp);
    if (!cachep)
        goto oops;

#if DEBUG
    cachep->obj_size = size;

    /*
     * Both debugging options require word-alignment which is calculated
     * into align above.
     */
    if (flags & SLAB_RED_ZONE) {
        /* add space for red zone words */
        cachep->obj_offset += sizeof(unsigned long long);
        size += 2 * sizeof(unsigned long long);
    }
    if (flags & SLAB_STORE_USER) {
        /* user store requires one word storage behind the end of
         * the real object. But if the second red zone needs to be
         * aligned to 64 bits, we must allow that much space.
         */
        if (flags & SLAB_RED_ZONE)
            size += REDZONE_ALIGN;
        else
            size += BYTES_PER_WORD;
    }
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
    if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
     && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
        cachep->obj_offset += PAGE_SIZE - size;
        size = PAGE_SIZE;
    }
#endif
#endif

    /*
     * Determine if the slab management is 'on' or 'off' slab.
     * (bootstrapping cannot cope with offslab caches so don't do
     * it too early on. Always use on-slab management when
     * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
     */
    /* Choose on-slab vs off-slab management: objects of PAGE_SIZE/8
     * (512 bytes with 4K pages) or more go off-slab; during early init
     * management is always on-slab. */
    if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
     !(flags & SLAB_NOLEAKTRACE))
        /*
         * Size is large, assume best to place the slab management obj
         * off-slab (should allow better packing of objs).
         */
        flags |= CFLGS_OFF_SLAB;
    /* Round size up to the alignment computed above. */
    size = ALIGN(size, align);
    /* Compute the slab page order and the leftover (fragment) bytes. */
    left_over = calculate_slab_order(cachep, size, align, flags);
    /* num is the number of objects per slab; zero means failure. */
    if (!cachep->num) {
        printk(KERN_ERR
         "kmem_cache_create: couldn't create cache %s.\n", name);
        kmem_cache_free(&cache_cache, cachep);
        cachep = NULL;
        goto oops;
    }
    /* Size of the slab management area: struct slab plus one
     * kmem_bufctl_t per object. */
    slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
             + sizeof(struct slab), align);

    /*
     * If the slab has been placed off-slab, and we have enough space then
     * move it on-slab. This is at the expense of any extra colouring.
     */
    /* If the leftover space can already hold the management area, move
     * an off-slab cache back on-slab. */
    if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
        /* Clear the off-slab flag: management is now on-slab. */
        flags &= ~CFLGS_OFF_SLAB;
        /* The management area now consumes part of the leftover. */
        left_over -= slab_size;
    }

    /* Still off-slab: recompute slab_size without alignment padding. */
    if (flags & CFLGS_OFF_SLAB) {
        /* really off slab. No need for manual alignment */
        slab_size =
         cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);

#ifdef CONFIG_PAGE_POISONING
        /* If we're going to use the generic kernel_map_pages()
         * poisoning, then it's going to smash the contents of
         * the redzone and userword anyhow, so switch them off.
         */
        if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
            flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
    }
    /* Colour unit: one hardware cache line. */
    cachep->colour_off = cache_line_size();
    /* Offset must be a multiple of the alignment. */
    if (cachep->colour_off < align)
        cachep->colour_off = align;
    /* Number of colour blocks that fit in the leftover space. */
    cachep->colour = left_over / cachep->colour_off;
    /* Record the size of the slab management area. */
    cachep->slab_size = slab_size;
    cachep->flags = flags;
    cachep->gfpflags = 0;
    /* Tag the cache for DMA allocations when requested and configured. */
    if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
        cachep->gfpflags |= GFP_DMA;
    /* Record the per-object size. */
    cachep->buffer_size = size;
    /* Reciprocal value used later to compute an object's slab index. */
    cachep->reciprocal_buffer_size = reciprocal_value(size);

    if (flags & CFLGS_OFF_SLAB) {
        /* Off-slab management: look up the general (kmalloc) cache
         * whose object size fits slab_size and remember it in
         * slabp_cache; management objects will be allocated from it.
         * For on-slab management slabp_cache stays NULL. */
        cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
        /*
         * This is a possibility for one of the malloc_sizes caches.
         * But since we go off slab only for object size greater than
         * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
         * this should not happen at all.
         * But leave a BUG_ON for some lucky dude.
         */
        BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
    }
    /* Record the constructor. */
    cachep->ctor = ctor;
    /* Record the cache name. */
    cachep->name = name;
    /* Set up the per-CPU local caches. */
    if (setup_cpu_cache(cachep, gfp)) {
        __kmem_cache_destroy(cachep);
        cachep = NULL;
        goto oops;
    }

    /* cache setup completed, link it into the list */
    /* Creation done: add the cache to the global cache_chain. */
    list_add(&cachep->next, &cache_chain);
oops:
    if (!cachep && (flags & SLAB_PANIC))
        panic("kmem_cache_create(): failed to create slab `%s'\n",
         name);
    /* Drop the mutex and the CPU-hotplug reference taken above
     * (skipped during early boot). */
    if (slab_is_available()) {
        mutex_unlock(&cache_chain_mutex);
        put_online_cpus();
    }
    return cachep;
}

下面函数计算slab由几个页面组成,以及每个slab中存在多少个对象 

/**
 * calculate_slab_order - calculate size (page order) of slabs
 * @cachep: pointer to the cache that is being created
 * @size: size of objects to be created in this cache.
 * @align: required alignment for the objects.
 * @flags: slab allocation flags
 *
 * Also calculates the number of objects per slab.
 *
 * This could be made much more intelligent. For now, try to avoid using
 * high order pages for slabs. When the gfp() functions are more friendly
 * towards high-order requests, this should be changed.
 */
static size_t calculate_slab_order(struct kmem_cache *cachep,
            size_t size, size_t align, unsigned long flags)
{
    unsigned long offslab_limit;
    size_t left_over = 0;
    int gfporder;

    for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
        unsigned int num;
        size_t remainder;
        /* Objects that fit at this order and the bytes wasted. */
        cache_estimate(gfporder, size, align, flags, &remainder, &num);
        /* Not even one object fits: try the next larger order. */
        if (!num)
            continue;
        /* (Adapted from http://blog.csdn.net/bullbat/article/details/7192845)
         * An off-slab cache allocates its management object (struct slab
         * plus the kmem_bufctl_t array) through the normal allocation
         * path:
         *   kmem_cache_alloc -> __cache_alloc -> __do_cache_alloc ->
         *   ____cache_alloc -> cache_alloc_refill -> cache_grow ->
         *   alloc_slabmgmt -> kmem_cache_alloc_node -> kmem_cache_alloc
         * If the management object's own cache were also off-slab,
         * alloc_slabmgmt would recurse forever. That can only happen
         * when a slab holds so many objects that the kmem_bufctl_t
         * array makes the management object large. The bound below is
         * rough: assuming the management object is also of size `size'
         * (already known to be off-slab), compute how many
         * kmem_bufctl_t entries would certainly force it off-slab, and
         * refuse more objects than that. Counts below the limit are
         * not guaranteed on-slab, but that is harmless: the
         * slab_break_gfp_order cap (usually 1, i.e. at most two pages
         * per slab) already keeps off-slab caches (objects > 512
         * bytes) from holding many objects. */
        if (flags & CFLGS_OFF_SLAB) {
            /*
             * Max number of objs-per-slab for caches which
             * use off-slab slabs. Needed to avoid a possible
             * looping condition in cache_grow().
             */
            offslab_limit = size - sizeof(struct slab);
            offslab_limit /= sizeof(kmem_bufctl_t);
             /* Too many objects for an off-slab cache: stop here. */
            if (num > offslab_limit)
                break;
        }

        /* Found something acceptable - save it away */
        /* Objects per slab. */
        cachep->num = num;
        /* Pages per slab (2^gfporder); see cache_estimate. */
        cachep->gfporder = gfporder;
        /* Fragment bytes left in the slab, from cache_estimate. */
        left_over = remainder;

        /*
         * A VFS-reclaimable slab tends to have most allocations
         * as GFP_NOFS and we really don't want to have to be allocating
         * higher-order pages when we are unable to shrink dcache.
         */
        /* Reclaimable slabs: their pages count as reclaimable memory
         * (freed via kmem_freepages), so keep the order minimal. */
        if (flags & SLAB_RECLAIM_ACCOUNT)
            break;

        /*
         * Large number of objects is good, but very large slabs are
         * currently bad for the gfp()s.
         */
        /* Stop once the per-slab page-order ceiling is reached. */
        if (gfporder >= slab_break_gfp_order)
            break;

        /*
         * Acceptable internal fragmentation?
         */
        /* Stop when wasted space is at most 1/8 of the slab. */
        if (left_over * 8 <= (PAGE_SIZE << gfporder))
            break;
    }
    /* Fragment bytes introduced by the chosen order. */
    return left_over;
}

单独再分析一下设置cpu的本地cache ,见下方函数

static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
    /* enable_cpucache was analysed earlier. FULL is set in
     * kmem_cache_init_late, when the general caches are ready, so the
     * per-CPU local caches can be configured directly. */
    if (g_cpucache_up == FULL)
        return enable_cpucache(cachep, gfp);
    /* g_cpucache_up tracks how far the general-cache bootstrap has
     * progressed: NONE / PARTIAL_AC / PARTIAL_L3 / EARLY / FULL. */
    /* Chicken-and-egg problem: delay the per-cpu array allocation until
       the general caches are up.
       static enum {
            NONE,            // early system init
            PARTIAL_AC,      // the cache for struct array_cache exists
            PARTIAL_L3,      // the cache for struct kmem_list3 exists
            EARLY,           // kmem_cache_init finished
            FULL             // kmem_cache_init_late: head arrays resized
       } g_cpucache_up;
    */
    if (g_cpucache_up == NONE) {
        /*
         * Note: the first kmem_cache_create must create the cache
         * that's used by kmalloc(24), otherwise the creation of
         * further caches will BUG().
         */
        /* Creating the struct array_cache cache itself: no general
         * cache exists yet, so use the static bootstrap array. */
        cachep->array[smp_processor_id()] = &initarray_generic.cache;

        /*
         * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
         * the first cache, then we need to set up all its list3s,
         * otherwise the creation of further caches will BUG().
         */
        /* The kmem_list3 cache does not exist either; use the static
         * global lists. */
        set_up_list3s(cachep, SIZE_AC);
        /* Advance the bootstrap state. */
        if (INDEX_AC == INDEX_L3)
            g_cpucache_up = PARTIAL_L3;
        else
            g_cpucache_up = PARTIAL_AC;
    } else {
        /* The general caches exist: allocate with kmalloc. */
        cachep->array[smp_processor_id()] =
            kmalloc(sizeof(struct arraycache_init), gfp);

        if (g_cpucache_up == PARTIAL_AC) {
            /* The kmem_list3 cache is not ready yet; keep using the
             * static global three-list structures. */
            set_up_list3s(cachep, SIZE_L3);
            /* Only reached while creating the kmem_list3 cache itself;
             * advance the bootstrap state. */
            g_cpucache_up = PARTIAL_L3;
        } else {
            int node;
            for_each_online_node(node) {
                /* Allocate the per-node lists with kmalloc. */
                cachep->nodelists[node] =
                 kmalloc_node(sizeof(struct kmem_list3),
                        gfp, node);
                BUG_ON(!cachep->nodelists[node]);
                /* Initialise the three lists. */
                kmem_list3_init(cachep->nodelists[node]);
            }
        }
    }
    /* Stagger each cache's next reap time so reaping is spread out. */
    cachep->nodelists[numa_node_id()]->next_reap =
            jiffies + REAPTIMEOUT_LIST3 +
            ((unsigned long)cachep) % REAPTIMEOUT_LIST3;

    /* Bootstrap-sized local cache parameters. */
    cpu_cache_get(cachep)->avail = 0;
    cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
    cpu_cache_get(cachep)->batchcount = 1;
    cpu_cache_get(cachep)->touched = 0;
    cachep->batchcount = 1;
    cachep->limit = BOOT_CPUCACHE_ENTRIES;
    return 0;
}

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值