The Linux slab allocator is the mechanism used for small memory allocations, mainly objects smaller than PAGE_SIZE, in contrast to page_alloc, which hands out memory in units of whole pages.
The slab mechanism breaks down into three parts: creating a slab descriptor, allocating slab objects, and reclaiming slab objects.
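Before diving into the internals, the following sketch shows how a kernel module typically uses this interface. It is only an illustration: struct foo, foo_cache and the foo_* functions are made-up names and error handling is kept minimal; the kmem_cache_* calls themselves are the standard slab API discussed in the rest of this section.

#include <linux/slab.h>

struct foo {
	int id;
	char payload[120];
};

static struct kmem_cache *foo_cache;

static int foo_cache_init(void)
{
	/* One slab descriptor describes all objects of this size/alignment. */
	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
				      0, SLAB_HWCACHE_ALIGN, NULL);
	return foo_cache ? 0 : -ENOMEM;
}

static void foo_cache_use(void)
{
	/* Allocation normally hits the per-CPU array_cache fast path. */
	struct foo *f = kmem_cache_alloc(foo_cache, GFP_KERNEL);

	if (f)
		kmem_cache_free(foo_cache, f);
}

static void foo_cache_exit(void)
{
	kmem_cache_destroy(foo_cache);
}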
1. Slab descriptor creation:
First, look at the slab descriptor:
struct kmem_cache {
	struct array_cache __percpu *cpu_cache;

/* 1) Cache tunables. Protected by slab_mutex */
	unsigned int batchcount;
	unsigned int limit;
	unsigned int shared;

	unsigned int size;
	struct reciprocal_value reciprocal_buffer_size;
/* 2) touched by every alloc & free from the backend */

	unsigned int flags;		/* constant flags */
	unsigned int num;		/* # of objs per slab */

/* 3) cache_grow/shrink */
	/* order of pgs per slab (2^n) */
	unsigned int gfporder;

	/* force GFP flags, e.g. GFP_DMA */
	gfp_t allocflags;

	size_t colour;			/* cache colouring range */
	unsigned int colour_off;	/* colour offset */
	struct kmem_cache *freelist_cache;
	unsigned int freelist_size;

	/* constructor func */
	void (*ctor)(void *obj);

/* 4) cache creation/removal */
	const char *name;
	struct list_head list;
	int refcount;
	int object_size;
	int align;

/* 5) statistics */
#ifdef CONFIG_MEMCG_KMEM
	struct memcg_cache_params memcg_params;
#endif

	struct kmem_cache_node *node[MAX_NUMNODES];
};
cpu_cache: on an SMP system, every CPU has its own local object cache (array_cache).
batchcount: the number of objects fetched from the shared pool or from the slabs_partial/slabs_free lists when the current CPU's local object cache (array_cache) is empty.
limit: when the number of free objects in the local object cache exceeds limit, batchcount objects are handed back so that slabs can be reclaimed and destroyed.
shared: controls the per-node shared object cache; a non-zero value means a shared pool is created between CPUs.
size: the object size after alignment has been applied.
reciprocal_buffer_size: a precomputed reciprocal of size, used to turn the division needed to map an object's offset to its index into a multiplication.
flags: the cache's constant flags (SLAB_* allocation and debug flags).
num: the maximum number of objects one slab can hold.
gfporder: each slab consists of 2^gfporder pages.
allocflags: forced GFP flags (e.g. GFP_DMA) that decide which zone memory is allocated from.
colour: the cache colouring range, computed as left_over / colour_off; the per-node colour counter cycles within this range.
colour_off: the size of one colour step, which is the CPU cache line size (raised to align if the alignment is larger).
freelist_cache: the kmem_cache used to allocate the freelist array when the slab management data is stored off-slab.
freelist_size: the size of the freelist array that stores the object indices of a slab.
name: the name of the slab descriptor.
list: the node that links this slab descriptor into the global list of slab descriptors.
object_size: the original (unaligned) object size.
align: the alignment applied to objects.
node: the slab node(s); on a UMA system each slab descriptor has a single node, which carries the three slab lists: slabs_partial, slabs_free and slabs_full.
The per-CPU local object cache, struct array_cache:
struct array_cache {
	unsigned int avail;
	unsigned int limit;
	unsigned int batchcount;
	unsigned int touched;
	void *entry[];	/*
			 * Must have this definition in here for the proper
			 * alignment of array_cache. Also simplifies accessing
			 * the entries.
			 *
			 * Entries should not be directly dereferenced as
			 * entries belonging to slabs marked pfmemalloc will
			 * have the lower bits set SLAB_OBJ_PFMEMALLOC
			 */
};
avail: the number of objects currently free in this local cache.
batchcount: the number of objects fetched from the shared pool or the slabs_partial/slabs_free lists when this local cache runs empty.
limit: when the number of free objects here exceeds limit, batchcount objects are released back, so slabs can be reclaimed and destroyed.
touched: set to 1 whenever an object is taken from this local cache (used to tell whether the cache is in active use).
entry: the array that holds the pointers to the cached objects (see the sketch below).
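To make avail, limit and touched concrete, here is a simplified, non-authoritative sketch of how the per-CPU fast path uses them. ac_fast_alloc() and ac_fast_free() are invented helper names; the real code in mm/slab.c additionally handles locking, NUMA placement and pfmemalloc objects.

/* Illustrative only: the LIFO fast path on a CPU's local array_cache. */
static void *ac_fast_alloc(struct array_cache *ac)
{
	if (ac->avail) {
		ac->touched = 1;		/* mark the cache as recently used */
		return ac->entry[--ac->avail];	/* pop the most recently freed, cache-warm object */
	}
	return NULL;	/* empty: the slow path refills batchcount objects from the node */
}

static int ac_fast_free(struct array_cache *ac, void *objp)
{
	if (ac->avail < ac->limit) {
		ac->entry[ac->avail++] = objp;	/* push back; stays cache-warm for the next alloc */
		return 1;
	}
	return 0;	/* over limit: flush batchcount objects back to the node lists first */
}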
The slab descriptor is created by kmem_cache_create():
struct kmem_cache *
kmem_cache_create(const char *name, size_t size, size_t align,
		  unsigned long flags, void (*ctor)(void *))
{
	struct kmem_cache *s = NULL;
	const char *cache_name;
	int err;
	……………………………………
	s = __kmem_cache_alias(name, size, align, flags, ctor);---------------(1)
	if (s)
		goto out_unlock;

	cache_name = kstrdup_const(name, GFP_KERNEL);
	if (!cache_name) {
		err = -ENOMEM;
		goto out_unlock;
	}

	s = create_cache(cache_name, size, size,-------------------------(2)
			 calculate_alignment(flags, align, size),
			 flags, ctor, NULL, NULL);
	if (IS_ERR(s)) {
		err = PTR_ERR(s);
		kfree_const(cache_name);
	}
	……………………………………
	return s;
}
(1) __kmem_cache_alias() first checks whether an existing slab descriptor with compatible size, alignment and flags can be reused (merged) instead of creating a new one (illustrated below).
(2) Otherwise, create_cache() is called to create a new slab descriptor.
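To illustrate the aliasing in step (1): with slab merging enabled, two caches created with compatible size, alignment and flags may end up sharing a single kmem_cache. The structs and names below are made up, and whether the merge actually happens depends on the kernel configuration (for example the slab_nomerge boot parameter) and on debug flags.

#include <linux/slab.h>

struct a { char buf[100]; };
struct b { char buf[100]; };

static struct kmem_cache *cache_a, *cache_b;

static void alias_demo(void)
{
	cache_a = kmem_cache_create("cache_a", sizeof(struct a), 0, 0, NULL);
	cache_b = kmem_cache_create("cache_b", sizeof(struct b), 0, 0, NULL);

	/*
	 * If __kmem_cache_alias() found cache_a reusable, cache_b may be the
	 * very same descriptor, and only one entry shows up in /proc/slabinfo.
	 */
	if (cache_a == cache_b)
		printk(KERN_INFO "the two caches were merged\n");
}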
static struct kmem_cache *create_cache(const char *name,
		size_t object_size, size_t size, size_t align,
		unsigned long flags, void (*ctor)(void *),
		struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
	struct kmem_cache *s;
	int err;

	err = -ENOMEM;
	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);--------------(1)
	if (!s)
		goto out;

	s->name = name;
	s->object_size = object_size;
	s->size = size;
	s->align = align;
	s->ctor = ctor;

	err = init_memcg_params(s, memcg, root_cache);
	if (err)
		goto out_free_cache;

	err = __kmem_cache_create(s, flags);-------------(2)
	if (err)
		goto out_free_cache;

	s->refcount = 1;
	list_add(&s->list, &slab_caches);-----------------(3)
out:
	if (err)
		return ERR_PTR(err);
	return s;

out_free_cache:
	destroy_memcg_params(s);
	kmem_cache_free(kmem_cache, s);
	goto out;
}
(1) Allocate a kmem_cache structure and fill in the caller-supplied fields.
(2) Call __kmem_cache_create() to continue initializing the slab descriptor.
(3) Once initialization succeeds, add the slab descriptor to the kernel's global slab_caches list.
int
__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
	size_t left_over, freelist_size;
	size_t ralign = BYTES_PER_WORD;
	gfp_t gfp;
	int err;
	size_t size = cachep->size;

	/*
	 * Check that size is in terms of words. This is needed to avoid
	 * unaligned accesses for some archs when redzoning is used, and makes
	 * sure any on-slab bufctl's are also correctly aligned.
	 */
	if (size & (BYTES_PER_WORD - 1)) {
		size += (BYTES_PER_WORD - 1);
		size &= ~(BYTES_PER_WORD - 1);------------(1)
	}

	if (flags & SLAB_RED_ZONE) {
		ralign = REDZONE_ALIGN;
		/* If redzoning, ensure that the second redzone is suitably
		 * aligned, by adjusting the object size accordingly. */
		size += REDZONE_ALIGN - 1;
		size &= ~(REDZONE_ALIGN - 1);
	}

	/* 3) caller mandated alignment */
	if (ralign < cachep->align) {
		ralign = cachep->align;
	}
	/* disable debug if necessary */
	if (ralign > __alignof__(unsigned long long))
		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
	/*
	 * 4) Store it.
	 */
	cachep->align = ralign;

	if (slab_is_available())-------------(2)
		gfp = GFP_KERNEL;
	else
		gfp = GFP_NOWAIT;

	/*
	 * Determine if the slab management is 'on' or 'off' slab.
	 * (bootstrapping cannot cope with offslab caches so don't do
	 * it too early on. Always use on-slab management when
	 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
	 */
	if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init &&
	    !(flags & SLAB_NOLEAKTRACE))
		/*
		 * Size is large, assume best to place the slab management obj
		 * off-slab (should allow better packing of objs).
		 */
		flags |= CFLGS_OFF_SLAB;

	size = ALIGN(size, cachep->align);--------------(3)
	/*
	 * We should restrict the number of objects in a slab to implement
	 * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition.
	 */
	if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
		size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);

	left_over = calculate_slab_order(cachep, size, cachep->align, flags);--------(4)

	if (!cachep->num)
		return -E2BIG;

	freelist_size = calculate_freelist_size(cachep->num, cachep->align);----------(5)

	/*
	 * If the slab has been placed off-slab, and we have enough space then
	 * move it on-slab. This is at the expense of any extra colouring.
	 */
	if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {
		flags &= ~CFLGS_OFF_SLAB;
		left_over -= freelist_size;
	}

	if (flags & CFLGS_OFF_SLAB) {
		/* really off slab. No need for manual alignment */
		freelist_size = calculate_freelist_size(cachep->num, 0);

#ifdef CONFIG_PAGE_POISONING
		/* If we're going to use the generic kernel_map_pages()
		 * poisoning, then it's going to smash the contents of
		 * the redzone and userword anyhow, so switch them off.
		 */
		if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
	}

	cachep->colour_off = cache_line_size();------------(6)
	/* Offset must be a multiple of the alignment. */
	if (cachep->colour_off < cachep->align)
		cachep->colour_off = cachep->align;
	cachep->colour = left_over / cachep->colour_off;
	cachep->freelist_size = freelist_size;
	cachep->flags = flags;
	cachep->allocflags = __GFP_COMP;
	if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
		cachep->allocflags |= GFP_DMA;-----------(7)
	cachep->size = size;
	cachep->reciprocal_buffer_size = reciprocal_value(size);

	if (flags & CFLGS_OFF_SLAB) {
		cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
		/*
		 * This is a possibility for one of the kmalloc_{dma,}_caches.
		 * But since we go off slab only for object size greater than
		 * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created
		 * in ascending order,this should not happen at all.
		 * But leave a BUG_ON for some lucky dude.
		 */
		BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
	}

	err = setup_cpu_cache(cachep, gfp);---------(8)
	if (err) {
		__kmem_cache_shutdown(cachep);
		return err;
	}

	return 0;
}
(1) Check whether the object size is a multiple of the machine word size; if not, round size up to word alignment.
(2) slab_is_available() checks whether the slab allocator itself is already usable; if it is, the following allocations use GFP_KERNEL (they may sleep and trigger reclaim), otherwise GFP_NOWAIT is used because the system is still early in boot.
(3) Re-align size to cachep->align; for example, a 20-byte object with an 8-byte alignment becomes 24 bytes.
(4) calculate_slab_order() determines how many pages (2^gfporder) each slab occupies and how many objects (num) it holds, and returns the space left over in each slab after the freelist array and the objects themselves.
(5) calculate_freelist_size() computes the size of the freelist array, which holds one index per object; it is roughly num entries (one byte each when byte indices are used), rounded up to align.
(6) colour_off is set to the CPU cache line size (raised to align if the alignment is larger), and colour is how many such offsets fit into the leftover space; a worked example follows after this list.
(7) Choose which zone memory comes from: GFP_DMA is added for caches that need DMA-able memory.
(8) Call setup_cpu_cache() to continue initializing the slab descriptor.
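As a worked example of steps (3) through (6), consider a cache with 200-byte objects and 8-byte alignment, assuming 4 KB pages, 64-byte cache lines, byte-sized freelist indices and no debug flags (the exact OFF_SLAB_MIN_SIZE threshold is architecture dependent; PAGE_SIZE/32 = 128 is assumed here, and the details of calculate_slab_order() can shift the exact numbers):
size = ALIGN(200, 8) = 200; since 200 >= 128, CFLGS_OFF_SLAB is tried first.
calculate_slab_order() settles on gfporder = 0: num = 4096 / 200 = 20 objects, left_over = 4096 - 20 * 200 = 96.
freelist_size = ALIGN(20 * 1, 8) = 24; because left_over (96) >= freelist_size (24), the management data is moved back on-slab and left_over becomes 96 - 24 = 72.
colour_off = 64 (one cache line) and colour = 72 / 64 = 1.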
static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
	if (slab_state >= FULL)-------------(1)
		return enable_cpucache(cachep, gfp);

	cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1);
	if (!cachep->cpu_cache)
		return 1;

	if (slab_state == DOWN) {
		/* Creation of first cache (kmem_cache). */
		set_up_node(kmem_cache, CACHE_CACHE);
	} else if (slab_state == PARTIAL) {
		/* For kmem_cache_node */
		set_up_node(cachep, SIZE_NODE);
	} else {
		int node;

		for_each_online_node(node) {
			cachep->node[node] = kmalloc_node(
				sizeof(struct kmem_cache_node), gfp, node);
			BUG_ON(!cachep->node[node]);
			kmem_cache_node_init(cachep->node[node]);
		}
	}

	cachep->node[numa_mem_id()]->next_reap =
			jiffies + REAPTIMEOUT_NODE +
			((unsigned long)cachep) % REAPTIMEOUT_NODE;

	cpu_cache_get(cachep)->avail = 0;
	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
	cpu_cache_get(cachep)->batchcount = 1;
	cpu_cache_get(cachep)->touched = 0;
	cachep->batchcount = 1;
	cachep->limit = BOOT_CPUCACHE_ENTRIES;
	return 0;
}
(1) For caches created after boot, slab initialization is already complete (slab_state >= FULL), so the code goes straight to enable_cpucache(); the remaining branches only matter while the slab allocator is bootstrapping itself.
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
	int err;
	int limit = 0;
	int shared = 0;
	int batchcount = 0;

	if (!is_root_cache(cachep)) {
		struct kmem_cache *root = memcg_root_cache(cachep);
		limit = root->limit;
		shared = root->shared;
		batchcount = root->batchcount;
	}

	if (limit && shared && batchcount)
		goto skip_setup;
	/*
	 * The head array serves three purposes:
	 * - create a LIFO ordering, i.e. return objects that are cache-warm
	 * - reduce the number of spinlock operations.
	 * - reduce the number of linked list operations on the slab and
	 *   bufctl chains: array operations are cheaper.
	 * The numbers are guessed, we should auto-tune as described by
	 * Bonwick.
	 */
	if (cachep->size > 131072)
		limit = 1;
	else if (cachep->size > PAGE_SIZE)
		limit = 8;
	else if (cachep->size > 1024)
		limit = 24;
	else if (cachep->size > 256)
		limit = 54;
	else
		limit = 120;

	/*
	 * CPU bound tasks (e.g. network routing) can exhibit cpu bound
	 * allocation behaviour: Most allocs on one cpu, most free operations
	 * on another cpu. For these cases, an efficient object passing between
	 * cpus is necessary. This is provided by a shared array. The array
	 * replaces Bonwick's magazine layer.
	 * On uniprocessor, it's functionally equivalent (but less efficient)
	 * to a larger limit. Thus disabled by default.
	 */
	shared = 0;
	if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
		shared = 8;

#if DEBUG
	/*
	 * With debugging enabled, large batchcount lead to excessively long
	 * periods with disabled local interrupts. Limit the batchcount
	 */
	if (limit > 32)
		limit = 32;
#endif
	batchcount = (limit + 1) / 2;
skip_setup:
	err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);--------(1)
	if (err)
		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
		       cachep->name, -err);
	return err;
}
(1) enable_cpucache() picks limit, batchcount and shared based on the object size, then initializes the corresponding members of the slab descriptor and of the per-CPU object cache array_cache. For example, with slab debugging off, a cache whose objects are 192 bytes gets limit = 120, shared = 8 on an SMP system, and batchcount = (120 + 1) / 2 = 60. The call chain is:
enable_cpucache()->do_tune_cpucache()->__do_tune_cpucache():
static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
				int batchcount, int shared, gfp_t gfp)
{
	struct array_cache __percpu *cpu_cache, *prev;
	int cpu;

	cpu_cache = alloc_kmem_cache_cpus(cachep, limit, batchcount);----------(1)
	if (!cpu_cache)
		return -ENOMEM;

	prev = cachep->cpu_cache;
	cachep->cpu_cache = cpu_cache;--------(2)
	kick_all_cpus_sync();

	check_irq_on();
	cachep->batchcount = batchcount;
	cachep->limit = limit;
	cachep->shared = shared;

	if (!prev)
		goto alloc_node;

	for_each_online_cpu(cpu) {
		LIST_HEAD(list);
		int node;
		struct kmem_cache_node *n;
		struct array_cache *ac = per_cpu_ptr(prev, cpu);

		node = cpu_to_mem(cpu);
		n = get_node(cachep, node);
		spin_lock_irq(&n->list_lock);
		free_block(cachep, ac->entry, ac->avail, node, &list);
		spin_unlock_irq(&n->list_lock);
		slabs_destroy(cachep, &list);
	}
	free_percpu(prev);

alloc_node:
	return alloc_kmem_cache_node(cachep, gfp);--------------(3)
}
(1) Allocate and initialize the per-CPU local object cache structures (array_cache) with the chosen limit and batchcount.
(2) Install the freshly initialized per-CPU caches into the slab descriptor's cpu_cache member; if an old set existed, its remaining objects are freed back and the old per-CPU memory is released.
(3) Call alloc_kmem_cache_node() to initialize the slab node(s) and fill in the descriptor's node member.
static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
{
	int node;
	struct kmem_cache_node *n;
	struct array_cache *new_shared;
	struct alien_cache **new_alien = NULL;

	for_each_online_node(node) {----------------(1)

		if (use_alien_caches) {
			new_alien = alloc_alien_cache(node, cachep->limit, gfp);
			if (!new_alien)
				goto fail;
		}

		new_shared = NULL;
		if (cachep->shared) {-------------(2)
			new_shared = alloc_arraycache(node,
				cachep->shared*cachep->batchcount,
					0xbaadf00d, gfp);
			if (!new_shared) {
				free_alien_cache(new_alien);
				goto fail;
			}
		}

		n = get_node(cachep, node);---------------(3)
		if (n) {
			struct array_cache *shared = n->shared;
			LIST_HEAD(list);

			spin_lock_irq(&n->list_lock);

			if (shared)
				free_block(cachep, shared->entry,
						shared->avail, node, &list);

			n->shared = new_shared;
			if (!n->alien) {
				n->alien = new_alien;
				new_alien = NULL;
			}
			n->free_limit = (1 + nr_cpus_node(node)) *
					cachep->batchcount + cachep->num;
			spin_unlock_irq(&n->list_lock);
			slabs_destroy(cachep, &list);
			kfree(shared);
			free_alien_cache(new_alien);
			continue;
		}
		n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);----------(4)
		if (!n) {
			free_alien_cache(new_alien);
			kfree(new_shared);
			goto fail;
		}

		kmem_cache_node_init(n);--------------(5)
		n->next_reap = jiffies + REAPTIMEOUT_NODE +
				((unsigned long)cachep) % REAPTIMEOUT_NODE;
		n->shared = new_shared;
		n->alien = new_alien;
		n->free_limit = (1 + nr_cpus_node(node)) *
					cachep->batchcount + cachep->num;
		cachep->node[node] = n;---------------(6)
	}
	return 0;

fail:
	if (!cachep->list.next) {
		/* Cache is not active yet. Roll back what we did */
		node--;
		while (node >= 0) {
			n = get_node(cachep, node);
			if (n) {
				kfree(n->shared);
				free_alien_cache(n->alien);
				kfree(n);
				cachep->node[node] = NULL;
			}
			node--;
		}
	}
	return -ENOMEM;
}
(1) Iterate over the online memory nodes; a UMA system has only one node.
(2) If a shared object cache between CPUs is wanted (shared != 0), allocate an object cache of shared * batchcount entries to serve as the node's shared pool.
(3) Fetch the node's existing kmem_cache_node, if any, and swap in the new shared/alien caches.
(4) If the slab node does not exist yet, allocate a kmem_cache_node structure.
(5) Perform the necessary initialization of the new slab node, including its free_limit (a short worked example follows this list).
(6) Store the node in the slab descriptor, as this descriptor's own per-node data.
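The free_limit set in steps (3) and (5) caps how many free objects a node may keep before whole slabs are destroyed. As a rough worked example, take the 200-byte cache from the earlier calculation (num = 20, and limit = 120 so batchcount = 60) on a single-node machine with 4 CPUs: free_limit = (1 + 4) * 60 + 20 = 320.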
At this point the creation of the slab descriptor is complete: the descriptor itself has been initialized, the per-CPU local object caches (array_cache) have been set up, and the slab node(s) have been initialized.