Before walking through the initialization, let's first look at the main structures used by the slab allocator:
struct kmem_cache {
/* 1) Cache tunables. Protected by cache_chain_mutex */
unsigned int batchcount; /* number of objects moved from a slab into the per-CPU array at a time */
unsigned int limit;
unsigned int shared;
unsigned int size; /* size of each object as actually allocated, after alignment */
u32 reciprocal_buffer_size;
/* 2) touched by every alloc & free from the backend */
unsigned int flags; /* constant flags */
unsigned int num; /* # of objs per slab */
/* 3) cache_grow/shrink */
/* order of pgs per slab (2^n) */
unsigned int gfporder; /* a slab occupies 2^gfporder pages */
/* force GFP flags, e.g. GFP_DMA */
gfp_t allocflags;
size_t colour; /* cache colouring range */
unsigned int colour_off; /* colour offset */
struct kmem_cache *slabp_cache;
unsigned int slab_size; /* size of the slab management structure */
/* constructor func */
void (*ctor)(void *obj);
/* 4) cache creation/removal */
const char *name;
struct list_head list; /* links all kmem_cache instances together (onto slab_caches) */
int refcount;
int object_size; /* size of each object */
int align;
/* 5) statistics */
#ifdef CONFIG_DEBUG_SLAB
unsigned long num_active;
unsigned long num_allocations;
unsigned long high_mark;
unsigned long grown;
unsigned long reaped;
unsigned long errors;
unsigned long max_freeable;
unsigned long node_allocs;
unsigned long node_frees;
unsigned long node_overflow;
atomic_t allochit;
atomic_t allocmiss;
atomic_t freehit;
atomic_t freemiss;
/*
* If debugging is enabled, then the allocator can add additional
* fields and/or padding to every object. size contains the total
* object size including these internal fields, the following two
* variables contain the offset to the user object and its size.
*/
int obj_offset;
#endif /* CONFIG_DEBUG_SLAB */
#ifdef CONFIG_MEMCG_KMEM
struct memcg_cache_params *memcg_params;
#endif
/* 6) per-cpu/per-node data, touched during every alloc/free */
/*
* We put array[] at the end of kmem_cache, because we want to size
* this array to nr_cpu_ids slots instead of NR_CPUS
* (see kmem_cache_init())
* We still use [NR_CPUS] and not [1] or [0] because cache_cache
* is statically defined, so we reserve the max number of cpus.
*
* We also need to guarantee that the list is able to accomodate a
* pointer for each node since "nodelists" uses the remainder of
* available pointers.
*/
struct kmem_cache_node **node; /* per-node slab management data */
struct array_cache *array[NR_CPUS + MAX_NUMNODES];
/* per-CPU data recording the local hot cache; every object allocation goes through it first */
/*
* Do not add fields after array[]
*/
};
Each object size class is represented by a kmem_cache structure, and all kmem_cache instances are linked onto the slab_caches list.
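As a rough illustration of that list (a sketch only, not kernel code; slab_caches is internal to mm/, and real walkers such as /proc/slabinfo take slab_mutex around the traversal):
/* Sketch: every kmem_cache is reachable from the global slab_caches list
 * through its "list" member. Illustrative only. */
static void dump_caches(void)
{
	struct kmem_cache *cachep;

	list_for_each_entry(cachep, &slab_caches, list)
		printk(KERN_INFO "%s: obj size %u, objs/slab %u\n",
		       cachep->name, cachep->size, cachep->num);
}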
struct kmem_cache_node {
spinlock_t list_lock;
#ifdef CONFIG_SLAB
struct list_head slabs_partial; /* slabs whose memory is partially in use hang on this list */
struct list_head slabs_full; /* slabs whose memory is fully in use hang on this list */
struct list_head slabs_free; /* slabs with no objects in use hang on this list */
unsigned long free_objects;
unsigned int free_limit;
unsigned int colour_next; /* Per-node cache coloring */
struct array_cache *shared; /* shared per node */
struct array_cache **alien; /* on other nodes */
unsigned long next_reap; /* updated without locking */
int free_touched; /* updated without locking */
#endif
#ifdef CONFIG_SLUB
unsigned long nr_partial;
struct list_head partial;
#ifdef CONFIG_SLUB_DEBUG
atomic_long_t nr_slabs;
atomic_long_t total_objects;
struct list_head full;
#endif
#endif
};
kmem_cache_node manages the actual slabs: depending on how much of a slab is in use, the slab is kept on one of the lists above.
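To make the role of the three lists concrete, here is a minimal sketch (not the kernel function) of the selection order that cache_alloc_refill, quoted in full later, implements over them:
/* Sketch only: simplified slab selection order over the three lists */
static struct list_head *pick_slab(struct kmem_cache_node *n)
{
	if (!list_empty(&n->slabs_partial))
		return n->slabs_partial.next;	/* prefer a partially used slab */
	if (!list_empty(&n->slabs_free))
		return n->slabs_free.next;	/* otherwise take a completely free one */
	return NULL;				/* both empty: cache_grow() must add a slab */
}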
struct array_cache {
unsigned int avail; /* number of objects currently available in this cache */
unsigned int limit;
unsigned int batchcount; /* number of objects fetched from the slabs at a time */
unsigned int touched;
spinlock_t lock;
void *entry[]; /* the cached object pointers live here */
};
array_cache implements the per-CPU hot cache: when memory is requested from the slab allocator, objects are handed out from here first.
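entry[] behaves as a per-CPU LIFO stack of object pointers: allocation pops the most recently freed object and freeing pushes it back. A minimal sketch of that fast path (the real helpers are ac_get_obj()/ac_put_obj(), which add debugging and accounting hooks; ac_pop/ac_push are made-up names):
/* Sketch of the per-CPU fast path implied by array_cache (simplified) */
static inline void *ac_pop(struct array_cache *ac)
{
	return ac->avail ? ac->entry[--ac->avail] : NULL;	/* alloc: pop the hottest object */
}
static inline void ac_push(struct array_cache *ac, void *objp)
{
	ac->entry[ac->avail++] = objp;				/* free: push it back */
}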
struct slab {
union {
struct {
struct list_head list;
unsigned long colouroff;
void *s_mem; /* start address of the memory holding the objects */
unsigned int inuse; /* num of objs active in slab */
kmem_bufctl_t free; /* index of the next free object */
unsigned short nodeid;
};
struct slab_rcu __slab_cover_slab_rcu;
};
};
A slab manages the objects themselves; every allocation from the slab allocator ultimately returns one of these objects.
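Objects inside a slab are located purely by index relative to s_mem; the kernel wraps exactly this arithmetic in index_to_obj()/obj_to_index(). A sketch (nth_obj is a made-up name):
/* Sketch: object i of a slab lives at s_mem + i * cachep->size */
static inline void *nth_obj(struct kmem_cache *cachep, struct slab *slabp,
			    unsigned int idx)
{
	return slabp->s_mem + cachep->size * idx;
}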
At system initialization the slab allocator is set up through the following call chain:
start_kernel
---------->mm_init
----------->kmem_cache_init
void __init kmem_cache_init(void)
{
int i;
kmem_cache = &kmem_cache_boot; /* point kmem_cache at the statically defined boot cache */
setup_node_pointer(kmem_cache); /* make node refer to the space right after the last per-CPU array slot, so the node pointers can be assigned through it */
if (num_possible_nodes() == 1)
use_alien_caches = 0;
for (i = 0; i < NUM_INIT_LISTS; i++) /* initialize the static init_kmem_cache_node array */
kmem_cache_node_init(&init_kmem_cache_node[i]);
set_up_node(kmem_cache, CACHE_CACHE); /* node[0] now points at init_kmem_cache_node[0] */
/*
* Fragmentation resistance on low memory - only use bigger
* page orders on machines with more than 32MB of memory if
* not overridden on the command line.
*/
if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
slab_max_order = SLAB_MAX_ORDER_HI;
/* Bootstrap is tricky, because several objects are allocated
* from caches that do not exist yet:
* 1) initialize the kmem_cache cache: it contains the struct
* kmem_cache structures of all caches, except kmem_cache itself:
* kmem_cache is statically allocated.
* Initially an __init data area is used for the head array and the
* kmem_cache_node structures, it's replaced with a kmalloc allocated
* array at the end of the bootstrap.
* 2) Create the first kmalloc cache.
* The struct kmem_cache for the new cache is allocated normally.
* An __init data area is used for the head array.
* 3) Create the remaining kmalloc caches, with minimally sized
* head arrays.
* 4) Replace the __init data head arrays for kmem_cache and the first
* kmalloc cache with kmalloc allocated arrays.
* 5) Replace the __init data for kmem_cache_node for kmem_cache and
* the other cache's with kmalloc allocated memory.
* 6) Resize the head arrays of the kmalloc caches to their final sizes.
*/
/* 1) create the kmem_cache */
/*
* struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
*/
/* create the slab cache for kmem_cache itself; it provides the memory for every kmem_cache created later */
create_boot_cache(kmem_cache, "kmem_cache",
offsetof(struct kmem_cache, array[nr_cpu_ids]) +
nr_node_ids * sizeof(struct kmem_cache_node *),
SLAB_HWCACHE_ALIGN);
/* add kmem_cache to slab_caches; every kmem_cache ends up on this list */
list_add(&kmem_cache->list, &slab_caches);
/* 2+3) create the kmalloc caches */
/*
* Initialize the caches that provide memory for the array cache and the
* kmem_cache_node structures first. Without this, further allocations will
* bug.
*/
/* create a kmem_cache named "kmalloc-ac" and its slab; it provides the memory for the array_cache structures allocated later */
kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac",
kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS);
/* create a kmem_cache named "kmalloc-node" and its slab; it provides the memory for the kmem_cache_node structures allocated later */
if (INDEX_AC != INDEX_NODE)
kmalloc_caches[INDEX_NODE] =
create_kmalloc_cache("kmalloc-node",
kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
slab_early_init = 0;
/* 4) Replace the bootstrap head arrays */
{
struct array_cache *ptr;
/* allocate an arraycache_init from the slab allocator to replace the static one used so far */
ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
memcpy(ptr, cpu_cache_get(kmem_cache),
sizeof(struct arraycache_init));
/*
* Do not assume that spinlocks can be initialized via memcpy:
*/
spin_lock_init(&ptr->lock);
kmem_cache->array[smp_processor_id()] = ptr;
ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC])
!= &initarray_generic.cache);
memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]),
sizeof(struct arraycache_init));
/*
* Do not assume that spinlocks can be initialized via memcpy:
*/
spin_lock_init(&ptr->lock);
kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr;
}
/* 5) Replace the bootstrap kmem_cache_node */
{
int nid;
/* allocate kmem_cache_node structures from the slab allocator to replace the static init_kmem_cache_node entries used so far */
for_each_online_node(nid) {
init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
init_list(kmalloc_caches[INDEX_AC],
&init_kmem_cache_node[SIZE_AC + nid], nid);
if (INDEX_AC != INDEX_NODE) {
init_list(kmalloc_caches[INDEX_NODE],
&init_kmem_cache_node[SIZE_NODE + nid], nid);
}
}
}
/* finally create the kmem_cache instances for the remaining kmalloc sizes */
create_kmalloc_caches(ARCH_KMALLOC_FLAGS);
}
The slab allocator sits on top of the buddy system: for small allocations it splits a page into multiple objects so that memory is used more efficiently.
In the code above, before the slab allocator itself is up, static structures have to be used to bootstrap it; once initialization has progressed far enough, memory is allocated from the slab allocator and these static structures are replaced.
In phase one, the slab for the cache named kmem_cache is set up; kmem_cache itself is a static structure, and both its kmem_cache_node and its array_cache are static as well.
In phase two, the slab for the cache named kmalloc-ac is set up; the kmem_cache slab is already initialized by then, so only two static structures are still needed: the kmem_cache_node and the array_cache.
In phase three, the slab for the cache named kmalloc-node is set up; the array_cache slab is also ready at that point, so only the static kmem_cache_node is used.
In phases four and five, all the static structures except the static kmem_cache itself are replaced with memory newly allocated from the slab caches.
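Before diving into the individual helpers, it helps to keep the consumer-facing API in mind; all of the bootstrap work below exists so that ordinary code like the following sketch works (struct foo, foo_cache and foo_init are made-up names, error handling trimmed):
#include <linux/slab.h>
/* Sketch: typical client-side use of the slab API that kmem_cache_init enables */
struct foo { int a; int b; };
static struct kmem_cache *foo_cache;
static int __init foo_init(void)
{
	struct foo *f;

	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
				      0, SLAB_HWCACHE_ALIGN, NULL);
	if (!foo_cache)
		return -ENOMEM;
	f = kmem_cache_alloc(foo_cache, GFP_KERNEL);	/* object comes from foo_cache's slabs */
	if (f)
		kmem_cache_free(foo_cache, f);
	/* plain kmalloc(size, GFP_KERNEL) instead goes through the kmalloc_caches[] set up here */
	return 0;
}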
Let's first look at setup_node_pointer:
static void setup_node_pointer(struct kmem_cache *cachep)
{
cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids];
}
Since node is declared as struct kmem_cache_node **, it is simply pointed just past the last per-CPU slot of array[] (with a single possible CPU that is &array[1], and the CPU cache itself only uses array[0]); after that, per-node pointers can be assigned through node[0] and so on. set_up_node is then called to fill cachep->node from the static variables, after which cachep->node is usable.
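Put differently, a single chunk of memory holds the fixed part of struct kmem_cache, then nr_cpu_ids per-CPU array_cache pointers, then nr_node_ids per-node pointers, with node simply aliasing the tail of array[]. An illustrative layout (a sketch, not kernel code):
/* Memory layout implied by setup_node_pointer():
 *
 *   +-----------------------------+  <- cachep
 *   | struct kmem_cache fields    |
 *   +-----------------------------+  <- &cachep->array[0]
 *   | array[0 .. nr_cpu_ids-1]    |  per-CPU array_cache pointers
 *   +-----------------------------+  <- &cachep->array[nr_cpu_ids] == cachep->node
 *   | node[0 .. nr_node_ids-1]    |  per-node kmem_cache_node pointers
 *   +-----------------------------+
 *
 * which is why create_boot_cache() below sizes the object as
 * offsetof(struct kmem_cache, array[nr_cpu_ids]) + nr_node_ids * sizeof(struct kmem_cache_node *).
 */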
create_boot_cache is then called to initialize the slab for kmem_cache; each object in this slab is the size of struct kmem_cache up to array[nr_cpu_ids], plus nr_node_ids pointers to struct kmem_cache_node.
create_boot_cache
-------->__kmem_cache_create
int
__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
size_t left_over, slab_size, ralign;
gfp_t gfp;
int err;
size_t size = cachep->size;
/*
* Check that size is in terms of words. This is needed to avoid
* unaligned accesses for some archs when redzoning is used, and makes
* sure any on-slab bufctl's are also correctly aligned.
*/
/* round size up to word alignment */
if (size & (BYTES_PER_WORD - 1)) {
size += (BYTES_PER_WORD - 1);
size &= ~(BYTES_PER_WORD - 1);
}
/*
* Redzoning and user store require word alignment or possibly larger.
* Note this will be overridden by architecture or caller mandated
* alignment if either is greater than BYTES_PER_WORD.
*/
if (flags & SLAB_STORE_USER)
ralign = BYTES_PER_WORD;
if (flags & SLAB_RED_ZONE) {
ralign = REDZONE_ALIGN;
/* If redzoning, ensure that the second redzone is suitably
* aligned, by adjusting the object size accordingly. */
size += REDZONE_ALIGN - 1;
size &= ~(REDZONE_ALIGN - 1);
}
/* 3) caller mandated alignment */
if (ralign < cachep->align) {
ralign = cachep->align;
}
/* disable debug if necessary */
if (ralign > __alignof__(unsigned long long))
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
/*
* 4) Store it.
*/
cachep->align = ralign;
if (slab_is_available())
gfp = GFP_KERNEL;
else
gfp = GFP_NOWAIT;
setup_node_pointer(cachep);
/*
* Determine if the slab management is 'on' or 'off' slab.
* (bootstrapping cannot cope with offslab caches so don't do
* it too early on. Always use on-slab management when
* SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
*/
if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
!(flags & SLAB_NOLEAKTRACE))
/*
* Size is large, assume best to place the slab management obj
* off-slab (should allow better packing of objs).
*/
flags |= CFLGS_OFF_SLAB;
size = ALIGN(size, cachep->align);
/* work out how many pages a slab needs and how many objects it holds; left_over is the wasted space */
left_over = calculate_slab_order(cachep, size, cachep->align, flags);
if (!cachep->num)
return -E2BIG;
/* compute the size of the slab management structure */
slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
+ sizeof(struct slab), cachep->align);
/*
* If the slab has been placed off-slab, and we have enough space then
* move it on-slab. This is at the expense of any extra colouring.
*/
/* CFLGS_OFF_SLAB is not set for the caches created here */
if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
flags &= ~CFLGS_OFF_SLAB;
left_over -= slab_size;
}
if (flags & CFLGS_OFF_SLAB) {
/* really off slab. No need for manual alignment */
slab_size =
cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
#ifdef CONFIG_PAGE_POISONING
/* If we're going to use the generic kernel_map_pages()
* poisoning, then it's going to smash the contents of
* the redzone and userword anyhow, so switch them off.
*/
if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
}
cachep->colour_off = cache_line_size();
/* Offset must be a multiple of the alignment. */
if (cachep->colour_off < cachep->align)
cachep->colour_off = cachep->align;
cachep->colour = left_over / cachep->colour_off;
cachep->slab_size = slab_size;
cachep->flags = flags;
cachep->allocflags = 0;
if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
cachep->allocflags |= GFP_DMA;
cachep->size = size;
cachep->reciprocal_buffer_size = reciprocal_value(size);
if (flags & CFLGS_OFF_SLAB) {
cachep->slabp_cache = kmalloc_slab(slab_size, 0u);
/*
* This is a possibility for one of the malloc_sizes caches.
* But since we go off slab only for object size greater than
* PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
* this should not happen at all.
* But leave a BUG_ON for some lucky dude.
*/
BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
}
/* set up the per-CPU array_cache */
err = setup_cpu_cache(cachep, gfp);
if (err) {
__kmem_cache_shutdown(cachep);
return err;
}
if (flags & SLAB_DEBUG_OBJECTS) {
/*
* Would deadlock through slab_destroy()->call_rcu()->
* debug_object_activate()->kmem_cache_alloc().
*/
WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
slab_set_debugobj_lock_classes(cachep);
} else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU))
on_slab_lock_classes(cachep);
return 0;
}
The two important functions here are calculate_slab_order and setup_cpu_cache.
static size_t calculate_slab_order(struct kmem_cache *cachep,
size_t size, size_t align, unsigned long flags)
{
unsigned long offslab_limit;
size_t left_over = 0;
int gfporder;
for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
unsigned int num;
size_t remainder;
/* compute the number of objects */
cache_estimate(gfporder, size, align, flags, &remainder, &num);
if (!num)
continue;
if (flags & CFLGS_OFF_SLAB) {
/*
* Max number of objs-per-slab for caches which
* use off-slab slabs. Needed to avoid a possible
* looping condition in cache_grow().
*/
offslab_limit = size - sizeof(struct slab);
offslab_limit /= sizeof(kmem_bufctl_t);
if (num > offslab_limit)
break;
}
/* Found something acceptable - save it away */
cachep->num = num; /* number of objects per slab */
cachep->gfporder = gfporder; /* the slab needs 1 << gfporder pages */
left_over = remainder;
/*
* A VFS-reclaimable slab tends to have most allocations
* as GFP_NOFS and we really don't want to have to be allocating
* higher-order pages when we are unable to shrink dcache.
*/
if (flags & SLAB_RECLAIM_ACCOUNT)
break;
/*
* Large number of objects is good, but very large slabs are
* currently bad for the gfp()s.
*/
if (gfporder >= slab_max_order)
break;
/*
* Acceptable internal fragmentation?
*/
if (left_over * 8 <= (PAGE_SIZE << gfporder))
break;
}
return left_over;
}
cache_estimate computes how many objects fit into a slab:
static void cache_estimate(unsigned long gfporder, size_t buffer_size,
size_t align, int flags, size_t *left_over,
unsigned int *num)
{
int nr_objs;
size_t mgmt_size;
size_t slab_size = PAGE_SIZE << gfporder;
/*
* The slab management structure can be either off the slab or
* on it. For the latter case, the memory allocated for a
* slab is used for:
*
* - The struct slab
* - One kmem_bufctl_t for each object
* - Padding to respect alignment of @align
* - @buffer_size bytes for each object
*
* If the slab management structure is off the slab, then the
* alignment will already be calculated into the size. Because
* the slabs are all pages aligned, the objects will be at the
* correct alignment when allocated.
*/
if (flags & CFLGS_OFF_SLAB) {
mgmt_size = 0;
nr_objs = slab_size / buffer_size;
if (nr_objs > SLAB_LIMIT)
nr_objs = SLAB_LIMIT;
} else {
/*
* Ignore padding for the initial guess. The padding
* is at most @align-1 bytes, and @buffer_size is at
* least @align. In the worst case, this result will
* be one greater than the number of objects that fit
* into the memory allocation when taking the padding
* into account.
*/
/* number of objects = (slab size - size of struct slab) / (object size + sizeof(kmem_bufctl_t)); every object needs one kmem_bufctl_t to record its position in the free chain */
nr_objs = (slab_size - sizeof(struct slab)) /
(buffer_size + sizeof(kmem_bufctl_t));
/*
* This calculated number will be either the right
* amount, or one greater than what we want.
*/
if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
> slab_size)
nr_objs--;
if (nr_objs > SLAB_LIMIT)
nr_objs = SLAB_LIMIT;
/* compute the size of the slab management structure */
mgmt_size = slab_mgmt_size(nr_objs, align);
}
*num = nr_objs;
/* leftover (wasted) space */
*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
}
With the calculations above we know how many pages a slab needs and how many objects fit into that space; the next step is setup_cpu_cache:
static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
if (slab_state >= FULL)
return enable_cpucache(cachep, gfp);
if (slab_state == DOWN) {
/*
* Note: Creation of first cache (kmem_cache).
* The setup_node is taken care
* of by the caller of __kmem_cache_create
*/
cachep->array[smp_processor_id()] = &initarray_generic.cache;
slab_state = PARTIAL;
} else if (slab_state == PARTIAL) {
/*
* Note: the second kmem_cache_create must create the cache
* that's used by kmalloc(24), otherwise the creation of
* further caches will BUG().
*/
cachep->array[smp_processor_id()] = &initarray_generic.cache;
/*
* If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is
* the second cache, then we need to set up all its node/,
* otherwise the creation of further caches will BUG().
*/
set_up_node(cachep, SIZE_AC);
if (INDEX_AC == INDEX_NODE)
slab_state = PARTIAL_NODE;
else
slab_state = PARTIAL_ARRAYCACHE;
} else {
/* Remaining boot caches */
cachep->array[smp_processor_id()] =
kmalloc(sizeof(struct arraycache_init), gfp);
if (slab_state == PARTIAL_ARRAYCACHE) {
set_up_node(cachep, SIZE_NODE);
slab_state = PARTIAL_NODE;
} else {
int node;
for_each_online_node(node) {
cachep->node[node] =
kmalloc_node(sizeof(struct kmem_cache_node),
gfp, node);
BUG_ON(!cachep->node[node]);
kmem_cache_node_init(cachep->node[node]);
}
}
}
cachep->node[numa_mem_id()]->next_reap =
jiffies + REAPTIMEOUT_LIST3 +
((unsigned long)cachep) % REAPTIMEOUT_LIST3;
cpu_cache_get(cachep)->avail = 0;
cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
cpu_cache_get(cachep)->batchcount = 1;
cpu_cache_get(cachep)->touched = 0;
cachep->batchcount = 1;
cachep->limit = BOOT_CPUCACHE_ENTRIES;
return 0;
}
Since slab_state is initially DOWN, the function above uses the static variable initarray_generic to initialize cachep->array and then sets slab_state = PARTIAL. Note that when the slab for array_cache is created later, cachep->array still points at the same static initarray_generic; because batchcount is set to 1, only one object lives there at a time, so this is not a real problem. Once cachep->array is initialized, allocations from kmem_cache become possible: avail is set to 0, meaning the CPU cache holds no objects yet and must be refilled, and batchcount is 1, so one object is taken from the slabs at a time; actual slab pages are only allocated when memory is first requested from the cache. With kmem_cache initialized, the next step is kmalloc-ac:
kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac",
kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS);
After initialization, the resulting kmem_cache is stored in kmalloc_caches. kmalloc_caches is an array: caches for the different size classes are placed into it according to a simple index calculation, so that a request for a given size range can pick the right kmem_cache directly by index.
struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
unsigned long flags)
{
/* allocate a kmem_cache structure from the slab allocator */
struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
if (!s)
panic("Out of memory when creating slab %s\n", name);
/* create this cache's own slab */
create_boot_cache(s, name, size, flags);
list_add(&s->list, &slab_caches);
s->refcount = 1;
return s;
}
kmem_cache_alloc
--------->slab_alloc
----------->__do_cache_alloc
------------>____cache_alloc
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *objp;
struct array_cache *ac;
bool force_refill = false;
check_irq_off();
/* fetch this CPU's cachep->array */
ac = cpu_cache_get(cachep);
/* on the first call avail is 0 */
if (likely(ac->avail)) {
ac->touched = 1;
objp = ac_get_obj(cachep, ac, flags, false);
/*
* Allow for the possibility all avail objects are not allowed
* by the current flags
*/
if (objp) {
STATS_INC_ALLOCHIT(cachep);
goto out;
}
force_refill = true;
}
STATS_INC_ALLOCMISS(cachep);
/* refill the array and allocate an object here */
objp = cache_alloc_refill(cachep, flags, force_refill);
/*
* the 'ac' may be updated by cache_alloc_refill(),
* and kmemleak_erase() requires its correct value.
*/
ac = cpu_cache_get(cachep);
out:
/*
* To avoid a false negative, if an object that is in one of the
* per-CPU caches is leaked, we need to make sure kmemleak doesn't
* treat the array pointers as a reference to the object.
*/
if (objp)
kmemleak_erase(&ac->entry[ac->avail]);
return objp;
}
cache_alloc_refill is called to obtain objects:
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
bool force_refill)
{
int batchcount;
struct kmem_cache_node *n;
struct array_cache *ac;
int node;
check_irq_off();
node = numa_mem_id();
if (unlikely(force_refill))
goto force_grow;
retry:
ac = cpu_cache_get(cachep);
batchcount = ac->batchcount;
if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
/*
* If there was little recent activity on this cache, then
* perform only a partial refill. Otherwise we could generate
* refill bouncing.
*/
batchcount = BATCHREFILL_LIMIT;
}
n = cachep->node[node];
BUG_ON(ac->avail > 0 || !n);
spin_lock(&n->list_lock);
/* See if we can refill from the shared array */
if (n->shared && transfer_objects(ac, n->shared, batchcount)) {
n->shared->touched = 1;
goto alloc_done;
}
while (batchcount > 0) {
struct list_head *entry;
struct slab *slabp;
/* Get slab alloc is to come from. */
entry = n->slabs_partial.next;
if (entry == &n->slabs_partial) {
n->free_touched = 1;
entry = n->slabs_free.next;
/* on the first allocation the node lists are all empty, so jump to must_grow */
if (entry == &n->slabs_free)
goto must_grow;
}
slabp = list_entry(entry, struct slab, list);
check_slabp(cachep, slabp);
check_spinlock_acquired(cachep);
/*
* The slab was either on partial or free list so
* there must be at least one object available for
* allocation.
*/
BUG_ON(slabp->inuse >= cachep->num);
while (slabp->inuse < cachep->num && batchcount--) {
STATS_INC_ALLOCED(cachep);
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
node));
}
check_slabp(cachep, slabp);
/* move slabp to correct slabp list: */
list_del(&slabp->list);
if (slabp->free == BUFCTL_END)
list_add(&slabp->list, &n->slabs_full);
else
list_add(&slabp->list, &n->slabs_partial);
}
must_grow:
n->free_objects -= ac->avail;
alloc_done:
spin_unlock(&n->list_lock);
if (unlikely(!ac->avail)) {
int x;
force_grow:
/* allocate space for a new slab */
x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
/* cache_grow can reenable interrupts, then ac could change. */
ac = cpu_cache_get(cachep);
node = numa_mem_id();
/* no objects in sight? abort */
if (!x && (ac->avail == 0 || force_refill))
return NULL;
/* after growing, retry so objects get pulled into the array */
if (!ac->avail) /* objects refilled by interrupt? */
goto retry;
}
ac->touched = 1;
return ac_get_obj(cachep, ac, flags, force_refill);
}
In the function above, no objects have been set up for the slab yet at the very beginning, so must_grow is taken and space is allocated for the node; the retry path then runs ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp, node)); which moves objects from the node's slab into the per-CPU array.
Let's look at cache_grow first:
static int cache_grow(struct kmem_cache *cachep,
gfp_t flags, int nodeid, void *objp)
{
struct slab *slabp;
size_t offset;
gfp_t local_flags;
struct kmem_cache_node *n;
/*
* Be lazy and only check for valid flags here, keeping it out of the
* critical path in kmem_cache_alloc().
*/
BUG_ON(flags & GFP_SLAB_BUG_MASK);
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
/* Take the node list lock to change the colour_next on this node */
check_irq_off();
n = cachep->node[nodeid];
spin_lock(&n->list_lock);
/* Get colour for the slab, and cal the next value. */
offset = n->colour_next;
n->colour_next++;
if (n->colour_next >= cachep->colour)
n->colour_next = 0;
spin_unlock(&n->list_lock);
offset *= cachep->colour_off;
if (local_flags & __GFP_WAIT)
local_irq_enable();
/*
* The test for missing atomic flag is performed here, rather than
* the more obvious place, simply to reduce the critical path length
* in kmem_cache_alloc(). If a caller is seriously mis-behaving they
* will eventually be caught here (where it matters).
*/
kmem_flagcheck(cachep, flags);
/*
* Get mem for the objs. Attempt to allocate a physical page from
* 'nodeid'.
*/
if (!objp)
/* allocate pages from the buddy system and return their virtual address */
objp = kmem_getpages(cachep, local_flags, nodeid);
if (!objp)
goto failed;
/* Get slab management. */
/* use the newly allocated memory to set up the slab management structure */
slabp = alloc_slabmgmt(cachep, objp, offset,
local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
if (!slabp)
goto opps1;
/* associate the slab with its pages */
slab_map_pages(cachep, slabp, objp);
/* fill in the kmem_bufctl_t array that follows the slab management data; each entry holds the index of the next free object */
cache_init_objs(cachep, slabp);
if (local_flags & __GFP_WAIT)
local_irq_disable();
check_irq_off();
spin_lock(&n->list_lock);
/* Make slab active. */
/* nothing has been handed out from this slab yet, so put it on slabs_free */
list_add_tail(&slabp->list, &(n->slabs_free));
STATS_INC_GROWN(cachep);
n->free_objects += cachep->num;
spin_unlock(&n->list_lock);
return 1;
opps1:
kmem_freepages(cachep, objp);
failed:
if (local_flags & __GFP_WAIT)
local_irq_disable();
return 0;
}
kmem_getpages eventually reaches __alloc_pages, requesting pages from the buddy system and returning the virtual address they map to. The freshly allocated memory block is then used to initialize the slab:
static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
int colour_off, gfp_t local_flags,
int nodeid)
{
struct slab *slabp;
if (OFF_SLAB(cachep)) {
/* Slab management obj is off-slab. */
slabp = kmem_cache_alloc_node(cachep->slabp_cache,
local_flags, nodeid);
/*
* If the first object in the slab is leaked (it's allocated
* but no one has a reference to it), we want to make sure
* kmemleak does not treat the ->s_mem pointer as a reference
* to the object. Otherwise we will not report the leak.
*/
kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
local_flags);
if (!slabp)
return NULL;
} else {
slabp = objp + colour_off;
colour_off += cachep->slab_size;
}
slabp->inuse = 0;
slabp->colouroff = colour_off;
slabp->s_mem = objp + colour_off; /* record where the objects start */
slabp->nodeid = nodeid;
slabp->free = 0;
return slabp;
}
static void cache_init_objs(struct kmem_cache *cachep,
struct slab *slabp)
{
int i;
for (i = 0; i < cachep->num; i++) {
void *objp = index_to_obj(cachep, slabp, i);
if (cachep->ctor)
cachep->ctor(objp);
slab_bufctl(slabp)[i] = i + 1;
} /* initialize the kmem_bufctl_t array behind the slab management structure; entry i holds the index of the next free object */
slab_bufctl(slabp)[i - 1] = BUFCTL_END;
}
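Assuming a slab with cachep->num = 4 objects (an illustrative number), the loop above leaves the management data in the following state, and slab_get_obj, shown a little further down, just walks this chain:
/* State right after cache_init_objs() for cachep->num == 4 (illustrative):
 *
 *   slabp->free = 0
 *   bufctl[]    = { 1, 2, 3, BUFCTL_END }
 *
 * Each allocation hands out object slabp->free and advances the chain:
 *   1st alloc: objp = index_to_obj(.., 0); free = bufctl[0] = 1
 *   2nd alloc: objp = index_to_obj(.., 1); free = bufctl[1] = 2
 *   ...
 *   4th alloc: free = BUFCTL_END  -> the slab is moved to slabs_full
 */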
So after this initialization slabs_free is no longer empty, and the earlier retry path runs:
while (slabp->inuse < cachep->num && batchcount--) {
STATS_INC_ALLOCED(cachep);
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
node));
}
With batchcount equal to 1, only one object is taken. ac_put_obj moves the object from the slab into the array; let's look at slab_get_obj:
static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
int nodeid)
{
void *objp = index_to_obj(cachep, slabp, slabp->free);
kmem_bufctl_t next;
slabp->inuse++;
next = slab_bufctl(slabp)[slabp->free];
#if DEBUG
slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
WARN_ON(slabp->nodeid != nodeid);
#endif
slabp->free = next;
return objp;
}
Each call updates slabp->free, taking the new value from the kmem_bufctl_t array; the array is laid out as kmem_bufctl[n] = n + 1, i.e. entry n points at the next object. After this the per-CPU array holds an object, and the kmem_cache allocation can return. Back in create_kmalloc_cache, create_boot_cache is called again to build the slab for kmalloc-ac, and this call deserves attention:
create_boot_cache
-------->__kmem_cache_create
---------->setup_cpu_cache
Inside setup_cpu_cache, this time the slab_state == PARTIAL branch is taken:
else if (slab_state == PARTIAL) {
/*
* Note: the second kmem_cache_create must create the cache
* that's used by kmalloc(24), otherwise the creation of
* further caches will BUG().
*/
cachep->array[smp_processor_id()] = &initarray_generic.cache;
/*
* If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is
* the second cache, then we need to set up all its node/,
* otherwise the creation of further caches will BUG().
*/
set_up_node(cachep, SIZE_AC);
if (INDEX_AC == INDEX_NODE)
slab_state = PARTIAL_NODE;
else
slab_state = PARTIAL_ARRAYCACHE;
}
Because the slab that backs cachep->array is the one being created right now, a static array still has to be used here. set_up_node is then called to take an entry from the init_kmem_cache_node array and assign it to cachep->node[0], and finally slab_state is set to PARTIAL_ARRAYCACHE. That completes the initialization of kmalloc-ac. Now let's see how kmalloc-node differs: by this point the slabs for kmem_cache and array_cache are both ready, so only kmem_cache_node still needs a static variable. The process is basically the same as above; the key part is here:
create_boot_cache
-------->__kmem_cache_create
---------->setup_cpu_cache
{
/* Remaining boot caches */
cachep->array[smp_processor_id()] =
kmalloc(sizeof(struct arraycache_init), gfp);
if (slab_state == PARTIAL_ARRAYCACHE) {
set_up_node(cachep, SIZE_NODE);
slab_state = PARTIAL_NODE;
} else {
int node;
for_each_online_node(node) {
cachep->node[node] =
kmalloc_node(sizeof(struct kmem_cache_node),
gfp, node);
BUG_ON(!cachep->node[node]);
kmem_cache_node_init(cachep->node[node]);
}
}
}
Here cachep->array is allocated dynamically with kmalloc, while kmem_cache_node is still set up from the static array, exactly as described above.
kmalloc
------>__kmalloc
-------->__do_kmalloc
static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
unsigned long caller)
{
struct kmem_cache *cachep;
void *ret;
/* If you want to save a few bytes .text space: replace
* __ with kmem_.
* Then kmalloc uses the uninlined functions instead of the inline
* functions.
*/
cachep = kmalloc_slab(size, flags); /* find the kmem_cache that can serve the requested size */
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
ret = slab_alloc(cachep, flags, caller); /* get an object from that kmem_cache */
trace_kmalloc(caller, ret,
size, cachep->size, flags);
return ret;
}
The function first uses kmalloc_slab to look up the kmem_cache:
struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
{
int index;
if (size > KMALLOC_MAX_SIZE) {
WARN_ON_ONCE(!(flags & __GFP_NOWARN));
return NULL;
}
if (size <= 192) {
if (!size)
return ZERO_SIZE_PTR;
index = size_index[size_index_elem(size)];
} else
index = fls(size - 1); /* derive the index from the size */
#ifdef CONFIG_ZONE_DMA
if (unlikely((flags & GFP_DMA)))
return kmalloc_dma_caches[index];
#endif
/* use the index to look up kmalloc_caches */
return kmalloc_caches[index];
}
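Two illustrative lookups, assuming the default size_index table and KMALLOC_MIN_SIZE = 8 (the table is patched per-architecture, so treat the second case as an assumption):
/* Worked examples (illustrative):
 *   kmalloc(300): 300 > 192 -> index = fls(300 - 1) = fls(299) = 9
 *                 -> kmalloc_caches[9], the 512-byte cache
 *   kmalloc(20) : 20 <= 192 -> index = size_index[(20 - 1) / 8] = size_index[2] = 5
 *                 -> kmalloc_caches[5], the 32-byte cache
 */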
The initialized kmalloc-ac cache was placed into kmalloc_caches earlier, so this lookup returns the kmalloc-ac kmem_cache and objects are then allocated from it. At this point the basic structures of the slab allocator are all initialized, but many of them are still static, so the second half of kmem_cache_init replaces these static structures with dynamically allocated ones:
/* 4) Replace the bootstrap head arrays */
{
struct array_cache *ptr;
ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
memcpy(ptr, cpu_cache_get(kmem_cache),
sizeof(struct arraycache_init));
/*
* Do not assume that spinlocks can be initialized via memcpy:
*/
spin_lock_init(&ptr->lock);
kmem_cache->array[smp_processor_id()] = ptr;
ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC])
!= &initarray_generic.cache);
memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]),
sizeof(struct arraycache_init));
/*
* Do not assume that spinlocks can be initialized via memcpy:
*/
spin_lock_init(&ptr->lock);
kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr;
}
/* 5) Replace the bootstrap kmem_cache_node */
{
int nid;
for_each_online_node(nid) {
init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
init_list(kmalloc_caches[INDEX_AC],
&init_kmem_cache_node[SIZE_AC + nid], nid);
if (INDEX_AC != INDEX_NODE) {
init_list(kmalloc_caches[INDEX_NODE],
&init_kmem_cache_node[SIZE_NODE + nid], nid);
}
}
}
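init_list itself is not quoted in this article. In essence (a simplified sketch rather than the verbatim kernel function) it has to allocate a fresh kmem_cache_node on the right node, copy the static bootstrap one into it, reinitialize the lock, splice the slab lists onto the new list heads (a plain memcpy would leave the list_heads pointing back into the static structure), and finally point cachep->node[nodeid] at the copy:
/* Sketch of what init_list() has to accomplish (simplified, not the exact kernel code) */
static void __init init_list_sketch(struct kmem_cache *cachep,
				    struct kmem_cache_node *boot, int nodeid)
{
	struct kmem_cache_node *ptr;

	ptr = kmalloc_node(sizeof(*ptr), GFP_NOWAIT, nodeid);
	BUG_ON(!ptr);
	memcpy(ptr, boot, sizeof(*ptr));
	spin_lock_init(&ptr->list_lock);		/* locks must not be memcpy'd */
	/* re-home the three slab lists onto the new node structure */
	INIT_LIST_HEAD(&ptr->slabs_full);
	list_splice(&boot->slabs_full, &ptr->slabs_full);
	INIT_LIST_HEAD(&ptr->slabs_partial);
	list_splice(&boot->slabs_partial, &ptr->slabs_partial);
	INIT_LIST_HEAD(&ptr->slabs_free);
	list_splice(&boot->slabs_free, &ptr->slabs_free);
	cachep->node[nodeid] = ptr;
}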
Finally, create_kmalloc_caches is called:
void __init create_kmalloc_caches(unsigned long flags)
{
int i;
/*
* Patch up the size_index table if we have strange large alignment
* requirements for the kmalloc array. This is only the case for
* MIPS it seems. The standard arches will not generate any code here.
*
* Largest permitted alignment is 256 bytes due to the way we
* handle the index determination for the smaller caches.
*
* Make sure that nothing crazy happens if someone starts tinkering
* around with ARCH_KMALLOC_MINALIGN
*/
BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
(KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
int elem = size_index_elem(i);
if (elem >= ARRAY_SIZE(size_index))
break;
size_index[elem] = KMALLOC_SHIFT_LOW;
}
if (KMALLOC_MIN_SIZE >= 64) {
/*
* The 96 byte size cache is not used if the alignment
* is 64 byte.
*/
for (i = 64 + 8; i <= 96; i += 8)
size_index[size_index_elem(i)] = 7;
}
if (KMALLOC_MIN_SIZE >= 128) {
/*
* The 192 byte sized cache is not used if the alignment
* is 128 byte. Redirect kmalloc to use the 256 byte cache
* instead.
*/
for (i = 128 + 8; i <= 192; i += 8)
size_index[size_index_elem(i)] = 8;
}
for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
if (!kmalloc_caches[i]) {
kmalloc_caches[i] = create_kmalloc_cache(NULL,
1 << i, flags);
}
/*
* Caches that are not of the two-to-the-power-of size.
* These have to be created immediately after the
* earlier power of two caches
*/
if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)
kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags);
if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags);
}
/* Kmalloc array is now usable */
slab_state = UP;
for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
struct kmem_cache *s = kmalloc_caches[i];
char *n;
if (s) {
n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i));
BUG_ON(!n);
s->name = n;
}
}
#ifdef CONFIG_ZONE_DMA
for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
struct kmem_cache *s = kmalloc_caches[i];
if (s) {
int size = kmalloc_size(i);
char *n = kasprintf(GFP_NOWAIT,
"dma-kmalloc-%d", size);
BUG_ON(!n);
kmalloc_dma_caches[i] = create_kmalloc_cache(n,
size, SLAB_CACHE_DMA | flags);
}
}
#endif
}
This function creates the caches for the remaining kmalloc sizes and stores them in the kmalloc_caches array; afterwards, a request that falls into one of these size ranges can simply index into kmalloc_caches and allocate from the corresponding cache. With this, the slab allocator is fully initialized. The relationships between the main structures can be summarized in a diagram:
A blog post is attached below for further reference: