Slab的数据结构:
struct slab {
struct list_head list;
unsigned long colouroff;
void *s_mem; /* including colour offset */
unsigned int inuse; /* num of objs active in slab */
kmem_bufctl_t free;
unsigned short nodeid;
};
Kmem_cache的数据结构:
struct kmem_cache {
/* 1) per-cpu data, touched during every alloc/free */
struct array_cache *array[NR_CPUS];
/* 2) Cache tunables. Protected by cache_chain_mutex */
unsigned int batchcount;
unsigned int limit;
unsigned int shared;
unsigned int buffer_size;
/* 3) touched by every alloc & free from the backend */
struct kmem_list3 *nodelists[MAX_NUMNODES];
unsigned int flags; /* constant flags */
unsigned int num; /* # of objs per slab */
/* 4) cache_grow/shrink */
/* order of pgs per slab (2^n) */
unsigned int gfporder;
/* force GFP flags, e.g. GFP_DMA */
gfp_t gfpflags;
size_t colour; /* cache colouring range */
unsigned int colour_off; /* colour offset */
struct kmem_cache *slabp_cache;
unsigned int slab_size;
unsigned int dflags; /* dynamic flags */
/* constructor func */
void (*ctor) (void *, struct kmem_cache *, unsigned long);
/* de-constructor func */
void (*dtor) (void *, struct kmem_cache *, unsigned long);
/* 5) cache creation/removal */
const char *name;
struct list_head next;
/* 6) statistics */
#if STATS
unsigned long num_active;
unsigned long num_allocations;
unsigned long high_mark;
unsigned long grown;
unsigned long reaped;
unsigned long errors;
unsigned long max_freeable;
unsigned long node_allocs;
unsigned long node_frees;
unsigned long node_overflow;
atomic_t allochit;
atomic_t allocmiss;
atomic_t freehit;
atomic_t freemiss;
#endif
#if DEBUG
/*
* If debugging is enabled, then the allocator can add additional
* fields and/or padding to every object. buffer_size contains the total
* object size including these internal fields, the following two
* variables contain the offset to the user object and its size.
*/
int obj_offset;
int obj_size;
#endif
};
struct kmem_list3 {
struct list_head slabs_partial; /* partial list first, better asm code */
struct list_head slabs_full;
struct list_head slabs_free;
unsigned long free_objects;
unsigned int free_limit;
unsigned int colour_next; /* Per-node cache coloring */
spinlock_t list_lock;
struct array_cache *shared; /* shared per node */
struct array_cache **alien; /* on other nodes */
unsigned long next_reap; /* updated without locking */
int free_touched; /* updated without locking */
};
Slab的目的是用来做cache的,就是保留一系列内核经常用的数据结构,这些对象(这些数据结构的内核另称)的组织是阶梯形的。
一个slab里面可以保存多个同类型对象,也可以只有一个。一个slab占的内存页的个数也在keme_cache中保存着。
在keme_cache数据结构里保存着关于这种对象的slab的信息。在slab数据结构保存的是这个slab的对象的地址等等。
对于每一种对象,都有一个kmem_cache数据结构与之对应,所有不同型对象的kmem_cache数据结构通过kmem_cache中的next相连,kmem_cache中的struct kmem_list3 *nodelists[MAX_NUMNODES];中保存着这种对象的slab的连接结构。同时数据结构kmem_cache也是通过slab来实现的。Cache_cache保存着kmem_cache的kmem_cache描述表。
另外在数据结构Kmem_cache看到有一项
struct array_cache *array[NR_CPUS];
其为每一个cpu都保留了一个这中对象的cache,这个cache在没有时填补,在多余时释放些。
看几个函数:
里面最核心的函数是static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags,
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
return __cache_alloc(cachep, flags, __builtin_return_address(0));
}
/**
* kmem_cache_alloc_node - Allocate an object on the specified node
* @cachep: The cache to allocate from.
* @flags: See kmalloc().
* @nodeid: node number of the target node.
*
* Identical to kmem_cache_alloc, except that this function is slow
* and can sleep. And it will allocate memory on the given node, which
* can improve the performance for cpu bound structures.
* New and improved: it will now make sure that the object gets
* put on the correct node list so that there is no false sharing.
*/
void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
unsigned long save_flags;
void *ptr;
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
if (nodeid == -1 || nodeid == numa_node_id() ||
!cachep->nodelists[nodeid])
ptr = ____cache_alloc(cachep, flags);
else
ptr = __cache_alloc_node(cachep, flags, nodeid);
local_irq_restore(save_flags);
ptr = cache_alloc_debugcheck_after(cachep, flags, ptr,
__builtin_return_address(0));
return ptr;
}
static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
gfp_t flags, void *caller)
{
unsigned long save_flags;
void *objp;
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
objp = ____cache_alloc(cachep, flags);
local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp,
caller);
prefetchw(objp);
return objp;
}
这个函数首先得到一个这个cpu的struct array_cache结构,重要的是对这个结构的理解,
struct array_cache {
unsigned int avail; //可用的个数,也是一个索引,每次都是从后往前区。
unsigned int limit;//最大限制,
unsigned int batchcount;//每次增减的个数
unsigned int touched;//最近是否取过?
spinlock_t lock;
void *entry[0];
/*这样定义便是对应的对象的起始地址,因为它定义成了0个,而在 struct array_cache 后面紧跟的便是对象数组。
* Must have this definition in here for the proper
* alignment of array_cache. Also simplifies accessing
* the entries.
* [0] is for gcc 2.95. It should really be [].
*/
};
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *objp;
struct array_cache *ac;
#ifdef CONFIG_NUMA
if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
objp = alternate_node_alloc(cachep, flags);
if (objp != NULL)
return objp;
}
#endif
check_irq_off();
ac = cpu_cache_get(cachep); //cache->array[smp_processor_id()];
if (likely(ac->avail)) {
STATS_INC_ALLOCHIT(cachep);
ac->touched = 1;
objp = ac->entry[--ac->avail];//从后往前取。
} else {
STATS_INC_ALLOCMISS(cachep);
objp = cache_alloc_refill(cachep, flags); //见其函数!!
}
return objp;
}
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
int batchcount;
struct kmem_list3 *l3;
struct array_cache *ac;
check_irq_off();
ac = cpu_cache_get(cachep); //cache->array[smp_processor_id()];
retry:
batchcount = ac->batchcount;
if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
/*
* If there was little recent activity on this cache, then
* perform only a partial refill. Otherwise we could generate
* refill bouncing.
*/
batchcount = BATCHREFILL_LIMIT;
}
//从这种对象的slab中取或从share中取,先得到slab的连接结构
L3 = cachep->nodelists[numa_node_id()];
BUG_ON(ac->avail > 0 || !l3);
spin_lock(&l3->list_lock);
/* See if we can refill from the shared array */
/* l3->shared 指向shared array */
if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
goto alloc_done;
while (batchcount > 0) {
struct list_head *entry;
struct slab *slabp;
/* Get slab alloc is to come from. */
entry = l3->slabs_partial.next;
if (entry == &l3->slabs_partial) {
l3->free_touched = 1;
entry = l3->slabs_free.next;
if (entry == &l3->slabs_free)
goto must_grow;
}
slabp = list_entry(entry, struct slab, list);
check_slabp(cachep, slabp);
check_spinlock_acquired(cachep);
while (slabp->inuse < cachep->num && batchcount--) {
STATS_INC_ALLOCED(cachep);
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
numa_node_id());
}
check_slabp(cachep, slabp);
/* move slabp to correct slabp list: */
list_del(&slabp->list);
if (slabp->free == BUFCTL_END)
list_add(&slabp->list, &l3->slabs_full);
else
list_add(&slabp->list, &l3->slabs_partial);
}
must_grow:
l3->free_objects -= ac->avail;
alloc_done:
spin_unlock(&l3->list_lock);
if (unlikely(!ac->avail)) {
int x;
x = cache_grow(cachep, flags, numa_node_id());
/* cache_grow can reenable interrupts, then ac could change. */
ac = cpu_cache_get(cachep);
if (!x && ac->avail == 0) /* no objects in sight? abort */
return NULL;
if (!ac->avail) /* objects refilled by interrupt? */
goto retry;
}
ac->touched = 1;
return ac->entry[--ac->avail];
kmalloc is for general purpose object.
static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, void *caller)
{
struct kmem_cache *cachep;
/* If you want to save a few bytes .text space: replace
* __ with kmem_.
* Then kmalloc uses the uninlined functions instead of the inline
* functions.
*/
cachep = __find_general_cachep(size, flags);
if (unlikely(cachep == NULL))
return NULL;
return __cache_alloc(cachep, flags, caller);
}
static inline struct kmem_cache *__find_general_cachep(size_t size,
gfp_t gfpflags)
{
struct cache_sizes *csizep = malloc_sizes;
#if DEBUG
/* This happens if someone tries to call
* kmem_cache_create(), or __kmalloc(), before
* the generic caches are initialized.
*/
BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
#endif
while (size > csizep->cs_size)
csizep++;
/*
* Really subtle: The last entry with cs->cs_size==ULONG_MAX
* has cs_{dma,}cachep==NULL. Thus no special case
* for large kmalloc calls required.
*/
if (unlikely(gfpflags & GFP_DMA))
return csizep->cs_dmacachep;
return csizep->cs_cachep;
}
struct cache_sizes malloc_sizes[] = {
#define CACHE(x) { .cs_size = (x) },
#include <linux/kmalloc_sizes.h>
CACHE(ULONG_MAX)
#undef CACHE
};