slab分配器的基本思想是:将内核中经常使用的对象放到高速缓存中,并且由系统保持为初始的可利用状态。如果没有基于对象的分配器,内核将花费更多的时间分配,初始化以及释放一个对象。slab分配器的目的是缓存这些空闲的对象,保留基本结构,从而能够重复使用它们。
slab分配器由多个高速缓存组成,它们通过双向循环链表链接在一起,称为高速缓存链表。每个高速缓存包含若干slab,slab由连续的页面帧组成,它们被划分成许多小的块以存放由高速缓存所管理的数据结构和对象。
slab分配器有三个基本目标:
- 减少伙伴系统分配小块内存时所产生的内部碎片。
- 把经常使用的对象缓存起来,减少分配,初始化以及释放对象的时间开销。
- 调整对象以更好地使用L1和L2硬件高速缓存。
1 slab分配器
1.1 slab描述符
高速缓存描述符用结构体 struct kmem_cache定义,在<linux/slab_def.h>中定义如下:
struct kmem_cache {
struct array_cache __percpu *cpu_cache;
/* 1) Cache tunables. Protected by slab_mutex */
unsigned int batchcount;
unsigned int limit;
unsigned int shared;
unsigned int size;
struct reciprocal_value reciprocal_buffer_size;
/* 2) touched by every alloc & free from the backend */
unsigned int flags; /* constant flags */
unsigned int num; /* # of objs per slab */
/* 3) cache_grow/shrink */
/* order of pgs per slab (2^n) */
unsigned int gfporder;
/* force GFP flags, e.g. GFP_DMA */
gfp_t allocflags;
size_t colour; /* cache colouring range */
unsigned int colour_off; /* colour offset */
struct kmem_cache *freelist_cache;
unsigned int freelist_size;
/* constructor func */
void (*ctor)(void *obj);
/* 4) cache creation/removal */
const char *name;
struct list_head list;
int refcount;
int object_size;/
int align;
/* 5) statistics */
#ifdef CONFIG_DEBUG_SLAB
unsigned long num_active;
unsigned long num_allocations;
unsigned long high_mark;
unsigned long grown;
unsigned long reaped;
unsigned long errors;
unsigned long max_freeable;
unsigned long node_allocs;
unsigned long node_frees;
unsigned long node_overflow;
atomic_t allochit;
atomic_t allocmiss;
atomic_t freehit;
atomic_t freemiss;
/*
* If debugging is enabled, then the allocator can add additional
* fields and/or padding to every object. size contains the total
* object size including these internal fields, the following two
* variables contain the offset to the user object and its size.
*/
int obj_offset;
#endif /* CONFIG_DEBUG_SLAB */
#ifdef CONFIG_MEMCG_KMEM
struct memcg_cache_params memcg_params;
#endif
struct kmem_cache_node *node[MAX_NUMNODES];
};
下面对高速缓存描述符的各个字段进行解释:
- cpu_cache:由结构体struct array_cache定义,本地CPU的对象缓冲池,可以加快每个CPU上的分配过程。struct array_cache在<kernel/mm/slab.c>中定义如下:
/*
* struct array_cache
*
* Purpose:
* - LIFO ordering, to hand out cache-warm objects from _alloc
* - reduce the number of linked list operations
* - reduce spinlock operations
*
* The limit is stored in the per-cpu structure to reduce the data cache
* footprint.
*
*/
struct array_cache {
/* 可用数量 */
unsigned int avail;
/* 最大对象数,多余的放回slab */
unsigned int limit;
/* 每次在slab缓存和slab中移入移出的数量 */
unsigned int batchcount;
/* 是否从SLAB缓冲池中分配过内存,从缓冲池移除一个对象时,
* 将touched置1,收缩缓存时,将touched置0
*/
unsigned int touched;
//slab对象地址
void *entry[]; /*
* Must have this definition in here for the proper
* alignment of array_cache. Also simplifies accessing
* the entries.
*
* Entries should not be directly dereferenced as
* entries belonging to slabs marked pfmemalloc will
* have the lower bits set SLAB_OBJ_PFMEMALLOC
*/
};
- batchcount:由slab_mutex保护,当CPU的本地对象缓冲池array_cache为空时,从共享的缓冲池或者slabs_partial、slabs_free列表中获取对象的数目。
- limit:高速缓存中缓存对象数量限制,当本地对象缓冲池的空闲对象数目大于limit时,就会主动释放batchcount个对象,便于内核回收和销毁slab。
- shared:用于SMP,在所有CPU之间灵活机动共享的SLAB个数
- size:slab对象长度大小,不含对齐所需补充的字节
- reciprocal_buffer_size:加快内部计算过程用的临时变量
- flags:描述slab对象属性的标志.
- num:每个slab中的对象个数
- gfporder:扩充或者收缩slab时,分配或者释放的内存order.每个slab占用2^gfporder个连续页面,分配时由伙伴系统提供。
- allocflags:分配内存的GFP标志
- colour:着色范围,一个slab中有几个不同的cache line。
- colour_off:两次着色之间的差,根据缓存行大小和对象对齐大小来计算
- freelist_cache:当kmem_cache位于slab外面时,预先分配的kmem_cache对象指针及其数量
- freelist_size:每个对象要占用1Byte来存放freelist。
- ctor:分配slab时的回调
- name:该高速缓存的名称,可以在proc/slabinfo中查看
- list:通过此字段将对象链接到cache_chain链表
- refcount:引用计数
- object_size:slab对象大小,含对齐所需字节
- align:对齐长度
- node:按节点管理的kmem_cache_node描述符。
1.2 创建slab描述符
函数kmem_cache_create用于创建slab描述符,kmem_cache_create在<mm/slab_comm.c>中实现如下:
struct kmem_cache *
kmem_cache_create(const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *))
{
struct kmem_cache *s;
const char *cache_name;
int err;
get_online_cpus();
get_online_mems();
memcg_get_cache_ids();
mutex_lock(&slab_mutex);
err = kmem_cache_sanity_check(name, size);
if (err) {
s = NULL; /* suppress uninit var warning */
goto out_unlock;
}
/*
* Some allocators will constraint the set of valid flags to a subset
* of all flags. We expect them to define CACHE_CREATE_MASK in this
* case, and we'll just provide them with a sanitized version of the
* passed flags.
*/
flags &= CACHE_CREATE_MASK;
s = __kmem_cache_alias(name, size, align, flags, ctor);
if (s)
goto out_unlock;
cache_name = kstrdup_const(name, GFP_KERNEL);
if (!cache_name) {
err = -ENOMEM;
goto out_unlock;
}
s = do_kmem_cache_create(cache_name, size, size,
calculate_alignment(flags, align, size),
flags, ctor, NULL, NULL);
if (IS_ERR(s)) {
err = PTR_ERR(s);
kfree_const(cache_name);
}
out_unlock:
mutex_unlock(&slab_mutex);
memcg_put_cache_ids();
put_online_mems();
put_online_cpus();
if (err) {
if (flags & SLAB_PANIC)
panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
name, err);
else {
printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d",
name, err);
dump_stack();
}
return NULL;
}
return s;
}
kmem_cache_create参数:
- name:slab描述符名称
- size:缓存对象大小
- align:缓存对象需要对齐的字节数
- flags:分配掩码
- ctor:对象的构造函数
在kmem_cache_create中,函数kmem_cache_sanity_check检查是否已经创建了同名的slab,如果没有创建过同名的slab,就调用函数__kmem_cache_alias查找是否有现成的slab描述符可以复用,若没有就调用do_kmem_cache_create来创建一个新的slab描述符。do_kmem_cache_create的实现如下:
static struct kmem_cache *
do_kmem_cache_create(const char *name, size_t object_size, size_t size,
size_t align, unsigned long flags, void (*ctor)(void *),
struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
struct kmem_cache *s;
int err;
err = -ENOMEM;
s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
if (!s)
goto out;
s->name = name;
s->object_size = object_size;
s->size = size;
s->align = align;
s->ctor = ctor;
err = init_memcg_params(s, memcg, root_cache);
if (err)
goto out_free_cache;
err = __kmem_cache_create(s, flags);
if (err)
goto out_free_cache;
s->refcount = 1;
list_add(&s->list, &slab_caches);
out:
if (err)
return ERR_PTR(err);
return s;
out_free_cache:
destroy_memcg_params(s);
kmem_cache_free(kmem_cache, s);
goto out;
}
do_kmem_cache_create调用kmem_cache_zalloc分配一个slab描述符,然后调用__kmem_cache_create创建一个slab缓存,创建成功后将新创建的slab描述符加入全局slab_caches中。
__kmem_cache_create的实现如下:
int
__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
size_t left_over, freelist_size;
size_t ralign = BYTES_PER_WORD;
gfp_t gfp;
int err;
size_t size = cachep->size;
/*
* Check that size is in terms of words. This is needed to avoid
* unaligned accesses for some archs when redzoning is used, and makes
* sure any on-slab bufctl's are also correctly aligned.
*/
if (size & (BYTES_PER_WORD - 1)) {
size += (BYTES_PER_WORD - 1);
size &= ~(BYTES_PER_WORD - 1);
}
if (flags & SLAB_RED_ZONE) {
ralign = REDZONE_ALIGN;
/* If redzoning, ensure that the second redzone is suitably
* aligned, by adjusting the object size accordingly. */
size += REDZONE_ALIGN - 1;
size &= ~(REDZONE_ALIGN - 1);
}
/* 3) caller mandated alignment */
if (ralign < cachep->align) {
ralign = cachep->align;
}
/* disable debug if necessary */
if (ralign > __alignof__(unsigned long long))
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
/*
* 4) Store it.
*/
cachep->align = ralign;
if (slab_is_available())
gfp = GFP_KERNEL;
else
gfp = GFP_NOWAIT;
/*
* Determine if the slab management is 'on' or 'off' slab.
* (bootstrapping cannot cope with offslab caches so don't do
* it too early on. Always use on-slab management when
* SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
*/
if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init &&/* SLAB对象较大 */
!(flags & SLAB_NOLEAKTRACE))
/*
* Size is large, assume best to place the slab management obj
* off-slab (should allow better packing of objs).
*/
flags |= CFLGS_OFF_SLAB;
size = ALIGN(size, cachep->align);
/*
* We should restrict the number of objects in a slab to implement
* byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition.
*/
if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
left_over = calculate_slab_order(cachep, size, cachep->align, flags);
if (!cachep->num)
return -E2BIG;
freelist_size = calculate_freelist_size(cachep->num, cachep->align);
/*
* If the slab has been placed off-slab, and we have enough space then
* move it on-slab. This is at the expense of any extra colouring.
*/
if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {
flags &= ~CFLGS_OFF_SLAB;
left_over -= freelist_size;
}
if (flags & CFLGS_OFF_SLAB) {
/* really off slab. No need for manual alignment */
freelist_size = calculate_freelist_size(cachep->num, 0);
#ifdef CONFIG_PAGE_POISONING
/* If we're going to use the generic kernel_map_pages()
* poisoning, then it's going to smash the contents of
* the redzone and userword anyhow, so switch them off.
*/
if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
}
cachep->colour_off = cache_line_size();
/* Offset must be a multiple of the alignment. */
if (cachep->colour_off < cachep->align)
cachep->colour_off = cachep->align;
cachep->colour = left_over / cachep->colour_off;
cachep->freelist_size = freelist_size;
cachep->flags = flags;
cachep->allocflags = __GFP_COMP;
if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
cachep->allocflags |= GFP_DMA;
cachep->size = size;
cachep->reciprocal_buffer_size = reciprocal_value(size);
if (flags & CFLGS_OFF_SLAB) {
cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
/*
* This is a possibility for one of the kmalloc_{dma,}_caches.
* But since we go off slab only for object size greater than
* PAGE_SIZE/8, and kmalloc_{dma,}_caches get created
* in ascending order,this should not happen at all.
* But leave a BUG_ON for some lucky dude.
*/
BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
}
err = setup_cpu_cache(cachep, gfp);
if (err) {
__kmem_cache_shutdown(cachep);
return err;
}
return 0;
}
__kmem_cache_create中,函数calculate_slab_order用于计算页面分配的order数量,可以容纳多少个obj对象,及每个SLAB中的对象容量,剩下的空间用于cache colour着色。calculate_slab_order的实现如下:
static size_t calculate_slab_order(struct kmem_cache *cachep,
size_t size, size_t align, unsigned long flags)
{
unsigned long offslab_limit;
size_t left_over = 0;
int gfporder;
for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
unsigned int num;
size_t remainder;
cache_estimate(gfporder, size, align, flags, &remainder, &num);
if (!num)
continue;
/* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */
if (num > SLAB_OBJ_MAX_NUM)
break;
if (flags & CFLGS_OFF_SLAB) {
size_t freelist_size_per_obj = sizeof(freelist_idx_t);
/*
* Max number of objs-per-slab for caches which
* use off-slab slabs. Needed to avoid a possible
* looping condition in cache_grow().
*/
if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
freelist_size_per_obj += sizeof(char);
offslab_limit = size;
offslab_limit /= freelist_size_per_obj;
if (num > offslab_limit)
break;
}
/* Found something acceptable - save it away */
cachep->num = num;
cachep->gfporder = gfporder;
left_over = remainder;
/*
* A VFS-reclaimable slab tends to have most allocations
* as GFP_NOFS and we really don't want to have to be allocating
* higher-order pages when we are unable to shrink dcache.
*/
if (flags & SLAB_RECLAIM_ACCOUNT)
break;
/*
* Large number of objects is good, but very large slabs are
* currently bad for the gfp()s.
*/
if (gfporder >= slab_max_order)
break;
/*
* Acceptable internal fragmentation?
*/
if (left_over * 8 <= (PAGE_SIZE << gfporder))
break;
}
return left_over;
}
回到__kmem_cache_create中,函数cache_line_size()计算L1 cache行的大小,用left_over除以L1 cache行的大小,就计算出了该slab空闲空间可以容纳多少次着色。
最后,__kmem_cache_create调用setup_cpu_cache函数来继续配置slab描述符。setup_cpu_cache函数实现如下:
static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
if (slab_state >= FULL)
return enable_cpucache(cachep, gfp);
cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1);
if (!cachep->cpu_cache)
return 1;
if (slab_state == DOWN) {
/* Creation of first cache (kmem_cache). */
set_up_node(kmem_cache, CACHE_CACHE);
} else if (slab_state == PARTIAL) {
/* For kmem_cache_node */
set_up_node(cachep, SIZE_NODE);
} else {
int node;
for_each_online_node(node) {
cachep->node[node] = kmalloc_node(
sizeof(struct kmem_cache_node), gfp, node);
BUG_ON(!cachep->node[node]);
kmem_cache_node_init(cachep->node[node]);
}
}
cachep->node[numa_mem_id()]->next_reap =
jiffies + REAPTIMEOUT_NODE +
((unsigned long)cachep) % REAPTIMEOUT_NODE;
cpu_cache_get(cachep)->avail = 0;
cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
cpu_cache_get(cachep)->batchcount = 1;
cpu_cache_get(cachep)->touched = 0;
cachep->batchcount = 1;
cachep->limit = BOOT_CPUCACHE_ENTRIES;
return 0;
}
setup_cpu_cache中,如果slab_state为FULL,即slab已经初始化完成了,就调用enable_cpucache直接分配内存并初始化。
1.3 分配slab对象
kmem_cache_alloc()函数是分配slab缓存对象的核心函数,在slab分配过程中,全程关闭本地中断。kmem_cache_alloc()在<mm/slab.c>中的实现如下:
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *ret = slab_alloc(cachep, flags, _RET_IP_);
trace_kmem_cache_alloc(_RET_IP_, ret,
cachep->object_size, cachep->size, flags);
return ret;
}
kmem_cache_alloc()通过调用slab_alloc()函数进行分配,slab_alloc()函数实现如下:
static __always_inline void *
slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
{
unsigned long save_flags;
void *objp;
flags &= gfp_allowed_mask;
lockdep_trace_alloc(flags);
if (slab_should_failslab(cachep, flags))
return NULL;
cachep = memcg_kmem_get_cache(cachep, flags);
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
objp = __do_cache_alloc(cachep, flags);
local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
flags);
prefetchw(objp);
if (likely(objp)) {
kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
if (unlikely(flags & __GFP_ZERO))
memset(objp, 0, cachep->object_size);
}
memcg_kmem_put_cache(cachep);
return objp;
}
函数slab_alloc()关闭中断后调用函数__do_cache_alloc()进行slab分配,函数__do_cache_alloc()内部调用____cache_alloc()进行slab分配。____cache_alloc()是slab分配的核心函数,如下所示:
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *objp;
struct array_cache *ac;
bool force_refill = false;
check_irq_off();
ac = cpu_cache_get(cachep);
if (likely(ac->avail)) {
ac->touched = 1;
objp = ac_get_obj(cachep, ac, flags, false);
/*
* Allow for the possibility all avail objects are not allowed
* by the current flags
*/
if (objp) {
STATS_INC_ALLOCHIT(cachep);
goto out;
}
force_refill = true;
}
STATS_INC_ALLOCMISS(cachep);
objp = cache_alloc_refill(cachep, flags, force_refill);
/*
* the 'ac' may be updated by cache_alloc_refill(),
* and kmemleak_erase() requires its correct value.
*/
ac = cpu_cache_get(cachep);
out:
/*
* To avoid a false negative, if an object that is in one of the
* per-CPU caches is leaked, we need to make sure kmemleak doesn't
* treat the array pointers as a reference to the object.
*/
if (objp)
kmemleak_erase(&ac->entry[ac->avail]);
return objp;
}
____cache_alloc()中,如果ac->avail>0,说明有本地缓冲池对象,直接调用ac_get_obj()从缓冲池中分配;如果ac->avail=0,说明本地缓冲为空,调用cache_alloc_refill()填充本地缓冲,并分配,cache_alloc_refill()的代码如下:
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
bool force_refill)
{
int batchcount;
struct kmem_cache_node *n;
struct array_cache *ac;
int node;
check_irq_off();
node = numa_mem_id();
if (unlikely(force_refill))
goto force_grow;
retry:
ac = cpu_cache_get(cachep);
batchcount = ac->batchcount;
if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
/*
* If there was little recent activity on this cache, then
* perform only a partial refill. Otherwise we could generate
* refill bouncing.
*/
batchcount = BATCHREFILL_LIMIT;
}
n = get_node(cachep, node);
BUG_ON(ac->avail > 0 || !n);
spin_lock(&n->list_lock);
/* See if we can refill from the shared array */
if (n->shared && transfer_objects(ac, n->shared, batchcount)) {
n->shared->touched = 1;
goto alloc_done;
}
while (batchcount > 0) {
struct list_head *entry;
struct page *page;
/* Get slab alloc is to come from. */
entry = n->slabs_partial.next;
if (entry == &n->slabs_partial) {
n->free_touched = 1;
entry = n->slabs_free.next;
if (entry == &n->slabs_free)
goto must_grow;
}
page = list_entry(entry, struct page, lru);
check_spinlock_acquired(cachep);
/*
* The slab was either on partial or free list so
* there must be at least one object available for
* allocation.
*/
BUG_ON(page->active >= cachep->num);
while (page->active < cachep->num && batchcount--) {
STATS_INC_ALLOCED(cachep);
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
ac_put_obj(cachep, ac, slab_get_obj(cachep, page,
node));
}
/* move slabp to correct slabp list: */
list_del(&page->lru);
if (page->active == cachep->num)
list_add(&page->lru, &n->slabs_full);
else
list_add(&page->lru, &n->slabs_partial);
}
must_grow:
n->free_objects -= ac->avail;
alloc_done:
spin_unlock(&n->list_lock);
if (unlikely(!ac->avail)) {
int x;
force_grow:
x = cache_grow(cachep, gfp_exact_node(flags), node, NULL);
/* cache_grow can reenable interrupts, then ac could change. */
ac = cpu_cache_get(cachep);
node = numa_mem_id();
/* no objects in sight? abort */
if (!x && (ac->avail == 0 || force_refill))
return NULL;
if (!ac->avail) /* objects refilled by interrupt? */
goto retry;
}
ac->touched = 1;
return ac_get_obj(cachep, ac, flags, force_refill);
}
cache_alloc_refill()中,如果有共享缓冲对象,就调用transfer_objects()从共享缓冲池中转移部分节点到本地缓冲区;如果共享池中也没有空闲对象了,就去查看节点中的slabs_partial链表和slabs_free链表。如果slabs_partial链表或slabs_free链表不为空,就从链表中取出一个成员slab,通过slab_get_obj()函数获取对象的地址,然后通过ac_put_obj()函数把对象迁移到本地对象缓冲池ac中,最后视情况将其放入partial或者full链表中;如果slabs_partial链表或slabs_free链表都为空,说明整个节点都没有空闲对象,这时需要调用cache_grow()重新分配slab。cache_grow()的代码如下:
/*
* Grow (by 1) the number of slabs within a cache. This is called by
* kmem_cache_alloc() when there are no active objs left in a cache.
*/
static int cache_grow(struct kmem_cache *cachep,
gfp_t flags, int nodeid, struct page *page)
{
void *freelist;
size_t offset;
gfp_t local_flags;
struct kmem_cache_node *n;
/*
* Be lazy and only check for valid flags here, keeping it out of the
* critical path in kmem_cache_alloc().
*/
if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
BUG();
}
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
/* Take the node list lock to change the colour_next on this node */
check_irq_off();
n = get_node(cachep, nodeid);
spin_lock(&n->list_lock);
/* Get colour for the slab, and cal the next value. */
offset = n->colour_next;
n->colour_next++;
if (n->colour_next >= cachep->colour)
n->colour_next = 0;
spin_unlock(&n->list_lock);
offset *= cachep->colour_off;
if (local_flags & __GFP_WAIT)
local_irq_enable();
/*
* The test for missing atomic flag is performed here, rather than
* the more obvious place, simply to reduce the critical path length
* in kmem_cache_alloc(). If a caller is seriously mis-behaving they
* will eventually be caught here (where it matters).
*/
kmem_flagcheck(cachep, flags);
/*
* Get mem for the objs. Attempt to allocate a physical page from
* 'nodeid'.
*/
if (!page)
page = kmem_getpages(cachep, local_flags, nodeid);
if (!page)
goto failed;
/* Get slab management. */
freelist = alloc_slabmgmt(cachep, page, offset,
local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
if (!freelist)
goto opps1;
slab_map_pages(cachep, page, freelist);
cache_init_objs(cachep, page);
if (local_flags & __GFP_WAIT)
local_irq_disable();
check_irq_off();
spin_lock(&n->list_lock);
/* Make slab active. */
list_add_tail(&page->lru, &(n->slabs_free));
STATS_INC_GROWN(cachep);
n->free_objects += cachep->num;
spin_unlock(&n->list_lock);
return 1;
opps1:
kmem_freepages(cachep, page);
failed:
if (local_flags & __GFP_WAIT)
local_irq_disable();
return 0;
}
1.4 释放slab对象
函数kmem_cache_free()用于释放slab缓存对象。代码如下:
/**
* kmem_cache_free - Deallocate an object
* @cachep: The cache the allocation was from.
* @objp: The previously allocated object.
*
* Free an object which was previously allocated from this
* cache.
*/
void kmem_cache_free(struct kmem_cache *cachep, void *objp)
{
unsigned long flags;
cachep = cache_from_obj(cachep, objp);
if (!cachep)
return;
local_irq_save(flags);
debug_check_no_locks_freed(objp, cachep->object_size);
if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(objp, cachep->object_size);
__cache_free(cachep, objp, _RET_IP_);
local_irq_restore(flags);
trace_kmem_cache_free(_RET_IP_, objp);
}
kmem_cache_free()中cache_from_obj()通过要释放对象obj的虚拟地址找到对应的slab描述符struct kmem_cache结构。__cache_free()是释放SLAB对象的主函数,如下所示:
/*
* Release an obj back to its cache. If the obj has a constructed state, it must
* be in this state _before_ it is released. Called with disabled ints.
*/
static inline void __cache_free(struct kmem_cache *cachep, void *objp,
unsigned long caller)
{
struct array_cache *ac = cpu_cache_get(cachep);
check_irq_off();
kmemleak_free_recursive(objp, cachep->flags);
objp = cache_free_debugcheck(cachep, objp, caller);
kmemcheck_slab_free(cachep, objp, cachep->object_size);
/*
* Skip calling cache_free_alien() when the platform is not numa.
* This will avoid cache misses that happen while accessing slabp (which
* is per page memory reference) to get nodeid. Instead use a global
* variable to skip the call, which is mostly likely to be present in
* the cache.
*/
if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
return;
if (ac->avail < ac->limit) {
STATS_INC_FREEHIT(cachep);
} else {
STATS_INC_FREEMISS(cachep);
cache_flusharray(cachep, ac);
}
ac_put_obj(cachep, ac, objp);
}
__cache_free()中如果ac->avail 大于 ac->limit,表示本地缓存太多,调用cache_flusharray()函数去尝试回收空闲对象。cache_flusharray()代码如下:
static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
{
int batchcount;
struct kmem_cache_node *n;
int node = numa_mem_id();
LIST_HEAD(list);
batchcount = ac->batchcount;
#if DEBUG
BUG_ON(!batchcount || batchcount > ac->avail);
#endif
check_irq_off();
n = get_node(cachep, node);
spin_lock(&n->list_lock);
if (n->shared) {
struct array_cache *shared_array = n->shared;
int max = shared_array->limit - shared_array->avail;
if (max) {
if (batchcount > max)
batchcount = max;
memcpy(&(shared_array->entry[shared_array->avail]),
ac->entry, sizeof(void *) * batchcount);
shared_array->avail += batchcount;
goto free_done;
}
}
free_block(cachep, ac->entry, batchcount, node, &list);
free_done:
#if STATS
{
int i = 0;
struct list_head *p;
p = n->slabs_free.next;
while (p != &(n->slabs_free)) {
struct page *page;
page = list_entry(p, struct page, lru);
BUG_ON(page->active);
i++;
p = p->next;
}
STATS_SET_FREEABLE(cachep, i);
}
#endif
spin_unlock(&n->list_lock);
slabs_destroy(cachep, &list);
ac->avail -= batchcount;
memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
}
cache_flusharray()中,如果有共享对象缓冲池,且共享池可以容纳部分对象,就把本地对象缓冲池中的空闲对象复制到共享对象缓冲池中。否则调用free_block()将batchcount个对象释放到SLAB页面中。free_block()代码如下:
/*
* Caller needs to acquire correct kmem_cache_node's list_lock
* @list: List of detached free slabs should be freed by caller
*/
static void free_block(struct kmem_cache *cachep, void **objpp,
int nr_objects, int node, struct list_head *list)
{
int i;
struct kmem_cache_node *n = get_node(cachep, node);
for (i = 0; i < nr_objects; i++) {
void *objp;
struct page *page;
clear_obj_pfmemalloc(&objpp[i]);
objp = objpp[i];
page = virt_to_head_page(objp);
list_del(&page->lru);
check_spinlock_acquired_node(cachep, node);
slab_put_obj(cachep, page, objp, node);
STATS_DEC_ACTIVE(cachep);
n->free_objects++;
/* fixup slab chains */
if (page->active == 0) {
if (n->free_objects > n->free_limit) {
n->free_objects -= cachep->num;
list_add_tail(&page->lru, list);
} else {
list_add(&page->lru, &n->slabs_free);
}
} else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
* other objects to be freed, too.
*/
list_add_tail(&page->lru, &n->slabs_partial);
}
}
}
最后,cache_flusharray()调用slabs_destroy()来销毁slab。