Each kmem_cache holds a per-CPU array_cache that acts as a per-CPU cache for object allocation. In addition, each kmem_cache has a kmem_cache_node instance whose shared field can serve as a cache common to all CPUs of that kmem_cache. The initialization covered in the previous article, however, gave the array_cache fields values that do not yet reflect this caching idea, and kmem_cache_node.shared was not used at all. The purpose of kmem_cache_init_late is to complete the slab allocator's caching machinery.
1 What does the shared cache do?
The shared cache sits between the cpu_cache and the slabs. When the cpu_cache runs out of objects, objects are first taken from the shared cache; only if the shared cache is also empty does allocation fall back to the slabs. Symmetrically, when an object is freed and the cpu_cache is already full, the object is pushed into the shared cache; only when the shared cache is also full is it returned to the slabs.
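A minimal sketch of that three-level lookup order in plain C (toy names only, not the real slab API; the real functions are analyzed below):

#include <stdlib.h>

/* Toy model of the lookup order; all names here are illustrative. */
struct toy_cache {
    int avail;              /* number of cached object pointers */
    void *entry[128];
};

void *toy_slab_get_obj(void)        /* stand-in for a real slab */
{
    return malloc(64);
}

void *toy_alloc(struct toy_cache *cpu, struct toy_cache *shared)
{
    if (cpu->avail)                 /* 1. per-CPU array_cache   */
        return cpu->entry[--cpu->avail];
    if (shared->avail)              /* 2. per-node shared cache */
        return shared->entry[--shared->avail];
    return toy_slab_get_obj();      /* 3. fall back to the slabs */
}

int main(void)
{
    struct toy_cache cpu = { 0 }, shared = { 0 };
    void *obj = toy_alloc(&cpu, &shared);  /* both caches empty: slab path */
    free(obj);
    return 0;
}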
The shared cache mechanism has the following benefits:
1 Freeing an object into the shared cache is faster than returning it to a slab, and likewise allocating an object from the shared cache is faster than taking one from a slab, so the shared cache speeds up both allocation and freeing.
2 It raises the hardware cache hit rate. Allocating and freeing same-sized objects in rapid succession is very common. In that pattern, an object freed into the shared cache will likely be handed out again shortly, before its lines have been evicted from the hardware cache, so hits are likely. Had the object been returned to its slab instead, the next allocation would probably be served from a different slab: the freed object's lines get evicted sooner, and the newly allocated object is also more likely to miss.
2 Shared cache initialization
start_kernel
--------->kmem_cache_init_late
void __init kmem_cache_init_late(void)
{
    struct kmem_cache *cachep;

    slab_state = UP;

    /* 6) resize the head arrays to their final sizes */
    mutex_lock(&slab_mutex);
    /* Walk every kmem_cache created so far; they all hang off the
     * slab_caches list. enable_cpucache computes each cache's shared
     * value and allocates its shared cache. */
    list_for_each_entry(cachep, &slab_caches, list)
        if (enable_cpucache(cachep, GFP_NOWAIT))
            BUG();
    mutex_unlock(&slab_mutex);

    /* Annotate slab for lockdep -- annotate the malloc caches */
    init_lock_keys();

    /* Done! */
    slab_state = FULL;

    /*
     * Register a cpu startup notifier callback that initializes
     * cpu_cache_get for all new cpus
     */
    register_cpu_notifier(&cpucache_notifier);

#ifdef CONFIG_NUMA
    /*
     * Register a memory hotplug callback that initializes and frees
     * node.
     */
    hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
#endif

    /*
     * The reap timers are started later, with a module init call: That part
     * of the kernel is not yet operational.
     */
}
The function above enables the cpucache of every kmem_cache created so far through the enable_cpucache interface, which is where the interesting work happens:
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
    int err;
    int limit = 0;
    int shared = 0;
    int batchcount = 0;

    if (!is_root_cache(cachep)) {
        struct kmem_cache *root = memcg_root_cache(cachep);
        limit = root->limit;
        shared = root->shared;
        batchcount = root->batchcount;
    }

    if (limit && shared && batchcount)
        goto skip_setup;
    /*
     * The head array serves three purposes:
     * - create a LIFO ordering, i.e. return objects that are cache-warm
     * - reduce the number of spinlock operations.
     * - reduce the number of linked list operations on the slab and
     *   bufctl chains: array operations are cheaper.
     * The numbers are guessed, we should auto-tune as described by
     * Bonwick.
     */
    if (cachep->size > 131072)
        limit = 1;
    else if (cachep->size > PAGE_SIZE)
        limit = 8;
    else if (cachep->size > 1024)
        limit = 24;
    else if (cachep->size > 256)
        limit = 54;
    else
        limit = 120;

    /*
     * CPU bound tasks (e.g. network routing) can exhibit cpu bound
     * allocation behaviour: Most allocs on one cpu, most free operations
     * on another cpu. For these cases, an efficient object passing between
     * cpus is necessary. This is provided by a shared array. The array
     * replaces Bonwick's magazine layer.
     * On uniprocessor, it's functionally equivalent (but less efficient)
     * to a larger limit. Thus disabled by default.
     */
    shared = 0;
    if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
        shared = 8;
    /* shared is set only when cachep->size <= PAGE_SIZE and there is more
     * than one possible CPU: the shared cache only takes effect on SMP */

#if DEBUG
    /*
     * With debugging enabled, large batchcount lead to excessively long
     * periods with disabled local interrupts. Limit the batchcount
     */
    if (limit > 32)
        limit = 32;
#endif
    batchcount = (limit + 1) / 2;
skip_setup:
    err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
    if (err)
        printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
               cachep->name, -err);
    return err;
}
enable_cpucache computes limit, batchcount and shared for the kmem_cache instance; shared * batchcount is the maximum number of objects the shared cache can hold. Let's pin the numbers down with a quick sketch before analyzing do_tune_cpucache.
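Here is a stand-alone restatement of the sizing heuristic for a non-debug SMP kernel (the PAGE_SIZE value and the example object size are assumptions; the thresholds are the ones from enable_cpucache above):

#include <stdio.h>

#define PAGE_SIZE 4096          /* assumption: 4 KiB pages */

/* Restates enable_cpucache's heuristic (non-debug, more than one CPU). */
void tune(size_t size, int *limit, int *batchcount, int *shared)
{
    if (size > 131072)
        *limit = 1;
    else if (size > PAGE_SIZE)
        *limit = 8;
    else if (size > 1024)
        *limit = 24;
    else if (size > 256)
        *limit = 54;
    else
        *limit = 120;

    *batchcount = (*limit + 1) / 2;
    *shared = (size <= PAGE_SIZE) ? 8 : 0;  /* assumes num_possible_cpus() > 1 */
}

int main(void)
{
    int limit, batch, shared;

    tune(192, &limit, &batch, &shared);
    /* prints: limit=120 batchcount=60 shared=8 cap=480 -- the shared
     * cache of a 192-byte cache can hold up to 480 objects per node */
    printf("limit=%d batchcount=%d shared=%d cap=%d\n",
           limit, batch, shared, shared * batch);
    return 0;
}

Now on to do_tune_cpucache: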
do_tune_cpucache
-------------->__do_tune_cpucache
static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
                              int batchcount, int shared, gfp_t gfp)
{
    struct ccupdate_struct *new;
    int i;

    /* Allocate a temporary ccupdate_struct to stage the new array_caches. */
    new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
                  gfp);
    if (!new)
        return -ENOMEM;

    for_each_online_cpu(i) {
        /* array_cache is per-CPU, so build one new instance per online
         * CPU with the freshly computed limit and batchcount. */
        new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
                                       batchcount, gfp);
        if (!new->new[i]) {
            for (i--; i >= 0; i--)
                kfree(new->new[i]);
            kfree(new);
            return -ENOMEM;
        }
    }
    new->cachep = cachep;

    /* On every CPU, swap cachep->array[i] for new->new[i]; afterwards
     * new->new[i] holds that CPU's old array_cache. */
    on_each_cpu(do_ccupdate_local, (void *)new, 1);

    check_irq_on();
    cachep->batchcount = batchcount;
    cachep->limit = limit;
    cachep->shared = shared;

    /* The per-CPU array_caches have been replaced above, so the old ones
     * must now be released. */
    for_each_online_cpu(i) {
        struct array_cache *ccold = new->new[i];
        if (!ccold)
            continue;
        spin_lock_irq(&cachep->node[cpu_to_mem(i)]->list_lock);
        /* free_block returns the ccold->avail object pointers held in
         * ccold->entry[] to the slab lists; during this early init pass
         * avail is still 0, so there is nothing to return. */
        free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));
        spin_unlock_irq(&cachep->node[cpu_to_mem(i)]->list_lock);
        /* kfree is close to free_block in spirit: slab memory lives in
         * lowmem, where virtual and physical addresses differ by a fixed
         * offset, so from the virtual address we easily reach the struct
         * page (whose lru field the slab allocator reuses) and from there
         * the owning kmem_cache. The key difference: free_block hands
         * objects straight back to the slab lists, while kfree first
         * returns the memory to the per-CPU array_cache. */
        kfree(ccold);
    }
    kfree(new);
    return alloc_kmemlist(cachep, gfp);
}
1 First, alloc_arraycache: it allocates an array_cache structure and initializes it.
static struct array_cache *alloc_arraycache(int node, int entries,
                                            int batchcount, gfp_t gfp)
{
    int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
    struct array_cache *nc = NULL;

    nc = kmalloc_node(memsize, gfp, node);
    /*
     * The array_cache structures contain pointers to free object.
     * However, when such objects are allocated or transferred to another
     * cache the pointers are not cleared and they could be counted as
     * valid references during a kmemleak scan. Therefore, kmemleak must
     * not scan such objects.
     */
    kmemleak_no_scan(nc);
    if (nc) {
        nc->avail = 0;
        nc->limit = entries;
        nc->batchcount = batchcount;
        nc->touched = 0;
        spin_lock_init(&nc->lock);
    }
    return nc;
}
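A note on the memsize computation above: entry[] is a flexible array member, so the array_cache header and the object-pointer array come from a single allocation. For orientation, the structure as defined in the 3.x slab code this article follows (verify against your own tree):

struct array_cache {
    unsigned int avail;       /* pointers currently cached in entry[]   */
    unsigned int limit;       /* capacity of entry[]                    */
    unsigned int batchcount;  /* objects moved per refill/flush         */
    unsigned int touched;     /* set on recent use; read by the reaper  */
    spinlock_t lock;
    void *entry[];            /* the cached object pointers, inline     */
};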
2 Next, on_each_cpu(do_ccupdate_local, (void *)new, 1); what actually runs on each CPU is do_ccupdate_local:
static void do_ccupdate_local(void *info)
{
    struct ccupdate_struct *new = info;
    struct array_cache *old;

    check_irq_off();
    old = cpu_cache_get(new->cachep);

    new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
    new->new[smp_processor_id()] = old;
}
This function installs the newly allocated array_cache in the kmem_cache, replacing the original one.
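The exchange pattern is easy to miss: because on_each_cpu runs the callback on every CPU with interrupts disabled, no CPU can be in the middle of using its array_cache during the swap, and each CPU's old pointer is parked in new->new[i] for the caller to free. A user-space toy of the same exchange (illustrative only):

#include <stdio.h>

#define NR_CPUS_TOY 4                 /* assumption for the sketch */

void *slot[NR_CPUS_TOY];              /* plays cachep->array[]     */
void *staged[NR_CPUS_TOY];            /* plays new->new[]          */

/* What do_ccupdate_local does on one CPU: swap the pointers so the
 * caller finds the old array_cache in staged[] and can free it. */
void swap_slot(int cpu)
{
    void *old = slot[cpu];
    slot[cpu] = staged[cpu];
    staged[cpu] = old;
}

int main(void)
{
    int old_marker = 1, new_marker = 2;

    slot[0] = &old_marker;
    staged[0] = &new_marker;
    swap_slot(0);                     /* like on_each_cpu(), per CPU */
    printf("installed=%d reclaimed=%d\n",
           *(int *)slot[0], *(int *)staged[0]);  /* prints 2 and 1 */
    return 0;
}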
The caller then uses free_block to return any objects still held by the old array_cache to the slab lists. Here is the implementation:
3 free_block
static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
                       int node)
{
    int i;
    struct kmem_cache_node *n;

    for (i = 0; i < nr_objects; i++) {
        void *objp;
        struct slab *slabp;

        clear_obj_pfmemalloc(&objpp[i]);
        objp = objpp[i];

        /* Slab memory comes from whole pages, so rounding the object's
         * virtual address down to its (head) page yields the struct page;
         * the slab allocator reuses fields there to point back at the
         * slab descriptor, which is how slabp is recovered. */
        slabp = virt_to_slab(objp);
        n = cachep->node[node];
        list_del(&slabp->list); /* unlink the slab from whichever node list it is on */
        check_spinlock_acquired_node(cachep, node);
        check_slabp(cachep, slabp);
        slab_put_obj(cachep, slabp, objp, node); /* hand the object back to its slab */
        STATS_DEC_ACTIVE(cachep);
        n->free_objects++; /* one more free object on this node */
        check_slabp(cachep, slabp);

        /* fixup slab chains */
        if (slabp->inuse == 0) { /* every object in this slab is now free */
            if (n->free_objects > n->free_limit) {
                /* The node caches more free objects than free_limit
                 * allows: return this slab's pages to the buddy system. */
                n->free_objects -= cachep->num;
                /* No need to drop any previously held
                 * lock here, even if we have a off-slab slab
                 * descriptor it is guaranteed to come from
                 * a different cache, refer to comments before
                 * alloc_slabmgmt.
                 */
                slab_destroy(cachep, slabp);
            } else {
                /* keep the fully free slab on the slabs_free list */
                list_add(&slabp->list, &n->slabs_free);
            }
        } else {
            /* Unconditionally move a slab to the end of the
             * partial list on free - maximum time for the
             * other objects to be freed, too.
             */
            list_add_tail(&slabp->list, &n->slabs_partial);
        }
    }
}
4 Now a look at kfree
void kfree(const void *objp)
{
    struct kmem_cache *c;
    unsigned long flags;

    trace_kfree(_RET_IP_, objp);

    if (unlikely(ZERO_OR_NULL_PTR(objp)))
        return;
    local_irq_save(flags);
    kfree_debugcheck(objp);
    c = virt_to_cache(objp);
    debug_check_no_locks_freed(objp, c->object_size);
    debug_check_no_obj_freed(objp, c->object_size);
    __cache_free(c, (void *)objp, _RET_IP_);
    local_irq_restore(flags);
}
The core of it is __cache_free:
static inline void __cache_free(struct kmem_cache *cachep, void *objp,
                                unsigned long caller)
{
    struct array_cache *ac = cpu_cache_get(cachep);

    check_irq_off();
    kmemleak_free_recursive(objp, cachep->flags);
    objp = cache_free_debugcheck(cachep, objp, caller);

    kmemcheck_slab_free(cachep, objp, cachep->object_size);

    /*
     * Skip calling cache_free_alien() when the platform is not numa.
     * This will avoid cache misses that happen while accessing slabp (which
     * is per page memory reference) to get nodeid. Instead use a global
     * variable to skip the call, which is mostly likely to be present in
     * the cache.
     */
    if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
        return;

    if (likely(ac->avail < ac->limit)) {
        STATS_INC_FREEHIT(cachep);
    } else {
        STATS_INC_FREEMISS(cachep);
        /* The CPU's array_cache is full: spill a batch of objects into
         * the shared array_cache (or, failing that, back to the slabs). */
        cache_flusharray(cachep, ac);
    }

    ac_put_obj(cachep, ac, objp); /* stash the freed object in the CPU's array_cache */
}
5 alloc_kmemlist: allocate the shared cache for the kmem_cache
static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
{
    int node;
    struct kmem_cache_node *n;
    struct array_cache *new_shared;
    struct array_cache **new_alien = NULL;

    for_each_online_node(node) {
        if (use_alien_caches) {
            new_alien = alloc_alien_cache(node, cachep->limit, gfp);
            if (!new_alien)
                goto fail;
        }

        new_shared = NULL;
        if (cachep->shared) {
            /* Allocate the node's shared array_cache; it can hold
             * shared * batchcount object pointers. 0xbaadf00d is a
             * poison value: the batchcount of a shared array is not
             * used. */
            new_shared = alloc_arraycache(node,
                    cachep->shared * cachep->batchcount,
                    0xbaadf00d, gfp);
            if (!new_shared) {
                free_alien_cache(new_alien);
                goto fail;
            }
        }

        n = cachep->node[node];
        if (n) {
            struct array_cache *shared = n->shared;

            spin_lock_irq(&n->list_lock);

            if (shared)
                free_block(cachep, shared->entry,
                           shared->avail, node);

            n->shared = new_shared;
            if (!n->alien) {
                n->alien = new_alien;
                new_alien = NULL;
            }
            n->free_limit = (1 + nr_cpus_node(node)) *
                    cachep->batchcount + cachep->num;
            spin_unlock_irq(&n->list_lock);
            kfree(shared);
            free_alien_cache(new_alien);
            continue;
        }

        n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
        if (!n) {
            free_alien_cache(new_alien);
            kfree(new_shared);
            goto fail;
        }

        kmem_cache_node_init(n);
        n->next_reap = jiffies + REAPTIMEOUT_LIST3 +
                ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
        n->shared = new_shared;
        n->alien = new_alien;
        n->free_limit = (1 + nr_cpus_node(node)) *
                cachep->batchcount + cachep->num;
        cachep->node[node] = n;
    }
    return 0;

fail:
    if (!cachep->list.next) {
        /* Cache is not active yet. Roll back what we did */
        node--;
        while (node >= 0) {
            if (cachep->node[node]) {
                n = cachep->node[node];

                kfree(n->shared);
                free_alien_cache(n->alien);
                kfree(n);
                cachep->node[node] = NULL;
            }
            node--;
        }
    }
    return -ENOMEM;
}
The main job of this function is, for the current kmem_cache, to give every kmem_cache_node a shared array_cache (plus alien caches on NUMA) and to set the node's free_limit.
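Two sizing results in alloc_kmemlist are worth pinning down with numbers. Continuing the earlier example (shared = 8, batchcount = 60) and assuming a 4-CPU node with cachep->num = 20 objects per slab (both values invented for illustration):

#include <stdio.h>

int main(void)
{
    /* assumed cache parameters from the running example */
    int shared = 8, batchcount = 60, num = 20, nr_cpus = 4;

    /* capacity of the node's shared array (entries passed to
     * alloc_arraycache above) */
    int shared_cap = shared * batchcount;              /* 480 */

    /* free objects a node may hoard before free_block() starts
     * returning whole slabs to the buddy allocator */
    int free_limit = (1 + nr_cpus) * batchcount + num; /* 320 */

    printf("shared_cap=%d free_limit=%d\n", shared_cap, free_limit);
    return 0;
}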
With that, initialization is complete.
3 Allocating and freeing objects through the shared cache
3.1 Freeing objects into the shared cache
When __cache_free finds the cpu_cache full, it calls cache_flusharray to move at most batchcount objects out of the cpu cache and into the shared cache:
__cache_free
---------->cache_flusharray
static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
{
    int batchcount;
    struct kmem_cache_node *n;
    int node = numa_mem_id();

    batchcount = ac->batchcount;
#if DEBUG
    BUG_ON(!batchcount || batchcount > ac->avail);
#endif
    check_irq_off();
    n = cachep->node[node];
    spin_lock(&n->list_lock);
    if (n->shared) {
        struct array_cache *shared_array = n->shared;
        int max = shared_array->limit - shared_array->avail;
        if (max) {
            if (batchcount > max)
                batchcount = max;
            /* copy batchcount object pointers into shared_array */
            memcpy(&(shared_array->entry[shared_array->avail]),
                   ac->entry, sizeof(void *) * batchcount);
            shared_array->avail += batchcount;
            goto free_done;
        }
    }

    free_block(cachep, ac->entry, batchcount, node);
free_done:
#if STATS
    {
        int i = 0;
        struct list_head *p;

        p = n->slabs_free.next;
        while (p != &(n->slabs_free)) {
            struct slab *slabp;

            slabp = list_entry(p, struct slab, list);
            BUG_ON(slabp->inuse);

            i++;
            p = p->next;
        }
        STATS_SET_FREEABLE(cachep, i);
    }
#endif
    spin_unlock(&n->list_lock);
    ac->avail -= batchcount; /* account for the objects just moved out */
    /* The flushed pointers came from the front of entry[], so shift the
     * remaining ac->avail pointers down to the start of the array. */
    memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *) * ac->avail);
}
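To make the bookkeeping concrete, here is a toy run of the same logic with invented numbers: the CPU cache holds 120 pointers and wants to flush batchcount = 60, but the shared array (limit 480) already has 440 entries in use:

#include <stdio.h>
#include <string.h>

#define SHARED_LIMIT 480

int main(void)
{
    void *entry[120] = { 0 };            /* plays ac->entry[]          */
    int avail = 120, batchcount = 60;    /* CPU cache state            */
    int shared_avail = 440;              /* shared array state         */

    int max = SHARED_LIMIT - shared_avail;  /* 40 free slots           */
    if (batchcount > max)
        batchcount = max;                /* clamp 60 -> 40             */

    /* cache_flusharray memcpy's entry[0..39] into the shared array,
     * then shifts the survivors down to the front: */
    shared_avail += batchcount;          /* 440 -> 480                 */
    avail -= batchcount;                 /* 120 -> 80                  */
    memmove(entry, &entry[batchcount], sizeof(void *) * avail);

    printf("flushed=%d cpu avail=%d shared avail=%d\n",
           batchcount, avail, shared_avail);
    return 0;
}

Note that the flush takes pointers from the front of ac->entry, i.e. the oldest and coldest entries, while fresh frees keep landing at the back of the array.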
3.2 Allocating objects from the shared cache
When the cpu cache has no objects left, cache_alloc_refill is called to refill it. It first checks whether the shared cache has objects available and, if so, uses transfer_objects to move a batch of them into the cpu cache; only when the shared cache is empty does it go to the slabs for objects.
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
                                bool force_refill)
{
    int batchcount;
    struct kmem_cache_node *n;
    struct array_cache *ac;
    int node;

    check_irq_off();
    node = numa_mem_id();
    if (unlikely(force_refill))
        goto force_grow;
retry:
    ac = cpu_cache_get(cachep);
    batchcount = ac->batchcount;
    if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
        /*
         * If there was little recent activity on this cache, then
         * perform only a partial refill. Otherwise we could generate
         * refill bouncing.
         */
        batchcount = BATCHREFILL_LIMIT;
    }
    n = cachep->node[node];

    BUG_ON(ac->avail > 0 || !n);
    spin_lock(&n->list_lock);

    /* See if we can refill from the shared array */
    if (n->shared && transfer_objects(ac, n->shared, batchcount)) {
        /* the node has a shared cache and objects were moved from it
         * into the current CPU's array_cache, so we are done */
        n->shared->touched = 1;
        goto alloc_done;
    }

    while (batchcount > 0) {
        struct list_head *entry;
        struct slab *slabp;
        /* Get slab alloc is to come from. */
        entry = n->slabs_partial.next;
        if (entry == &n->slabs_partial) {
            n->free_touched = 1;
            entry = n->slabs_free.next;
            if (entry == &n->slabs_free)
                goto must_grow;
        }

        slabp = list_entry(entry, struct slab, list);
        check_slabp(cachep, slabp);
        check_spinlock_acquired(cachep);

        /*
         * The slab was either on partial or free list so
         * there must be at least one object available for
         * allocation.
         */
        BUG_ON(slabp->inuse >= cachep->num);

        while (slabp->inuse < cachep->num && batchcount--) {
            STATS_INC_ALLOCED(cachep);
            STATS_INC_ACTIVE(cachep);
            STATS_SET_HIGH(cachep);

            ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
                                                node));
        }
        check_slabp(cachep, slabp);

        /* move slabp to correct slabp list: */
        list_del(&slabp->list);
        if (slabp->free == BUFCTL_END)
            list_add(&slabp->list, &n->slabs_full);
        else
            list_add(&slabp->list, &n->slabs_partial);
    }

must_grow:
    n->free_objects -= ac->avail;
alloc_done:
    spin_unlock(&n->list_lock);

    if (unlikely(!ac->avail)) {
        int x;
force_grow:
        x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);

        /* cache_grow can reenable interrupts, then ac could change. */
        ac = cpu_cache_get(cachep);
        node = numa_mem_id();

        /* no objects in sight? abort */
        if (!x && (ac->avail == 0 || force_refill))
            return NULL;

        if (!ac->avail) /* objects refilled by interrupt? */
            goto retry;
    }
    ac->touched = 1;

    /* The CPU's array_cache has been refilled (from the shared cache or
     * the slabs), so this allocation now succeeds. */
    return ac_get_obj(cachep, ac, flags, force_refill);
}
A look at transfer_objects:
static int transfer_objects(struct array_cache *to,
                            struct array_cache *from, unsigned int max)
{
    /* Figure out how many entries to transfer */
    int nr = min3(from->avail, max, to->limit - to->avail);

    if (!nr)
        return 0;

    memcpy(to->entry + to->avail, from->entry + from->avail - nr,
           sizeof(void *) * nr);

    from->avail -= nr;
    to->avail += nr;
    return nr;
}
It moves objects from the shared cache into the current CPU's array_cache.
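One detail worth noting: the memcpy takes the nr pointers from the tail of from->entry, i.e. the most recently freed and therefore most cache-warm objects. A toy run with invented numbers (from->avail = 100, max = 60, destination empty with limit 120):

#include <stdio.h>
#include <string.h>

static int min3i(int a, int b, int c)
{
    int m = a < b ? a : b;
    return m < c ? m : c;
}

int main(void)
{
    void *from_entry[128] = { 0 }, *to_entry[128] = { 0 };
    int from_avail = 100, to_avail = 0, to_limit = 120, max = 60;

    int nr = min3i(from_avail, max, to_limit - to_avail);  /* 60 */

    /* copy the warmest (last-freed) pointers, exactly as the kernel
     * memcpy does: from_entry[40..99] -> to_entry[0..59] */
    memcpy(to_entry + to_avail, from_entry + from_avail - nr,
           sizeof(void *) * nr);
    from_avail -= nr;                     /* 100 -> 40 */
    to_avail += nr;                       /*   0 -> 60 */

    printf("nr=%d from_avail=%d to_avail=%d\n", nr, from_avail, to_avail);
    return 0;
}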