The SLUB allocator mainly exposes the following interfaces:
// create a slab cache
struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
unsigned long,
void (*)(void *));
// destroy a slab cache
void kmem_cache_destroy(struct kmem_cache *);
// allocate an object from a slab cache
static inline void *kmem_cache_alloc_node(struct kmem_cache *cachep,
gfp_t flags, int node);
// free an object
void kmem_cache_free(struct kmem_cache *, void *);
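Before diving into the implementation, here is a minimal usage sketch showing the typical lifecycle of a cache from a kernel module (struct foo and the cache name are made up for illustration; kmem_cache_alloc() is the non-NUMA variant of the kmem_cache_alloc_node() listed above):

#include <linux/module.h>
#include <linux/slab.h>

struct foo {
	int id;
	char name[32];
};

static struct kmem_cache *foo_cache;

static int __init foo_init(void)
{
	struct foo *f;

	/* one cache for struct foo objects, cache-line aligned, no constructor */
	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
				      SLAB_HWCACHE_ALIGN, NULL);
	if (!foo_cache)
		return -ENOMEM;

	f = kmem_cache_alloc(foo_cache, GFP_KERNEL);	/* take one object */
	if (f) {
		f->id = 1;
		kmem_cache_free(foo_cache, f);		/* give it back */
	}
	return 0;
}

static void __exit foo_exit(void)
{
	kmem_cache_destroy(foo_cache);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");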
Now let's look at the code:
struct kmem_cache *
kmem_cache_create(const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *))
{
return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL);
}
/*
 * Create a slab cache. First check whether an existing kmem_cache can be
 * reused (aliased/merged) to satisfy the request; if not, allocate a new
 * kmem_cache structure via the slub allocator, set it up and construct the
 * cache.
 */
struct kmem_cache *
kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
size_t align, unsigned long flags, void (*ctor)(void *),
struct kmem_cache *parent_cache)
{
struct kmem_cache *s = NULL;
int err = 0;
get_online_cpus();
mutex_lock(&slab_mutex);
if (!kmem_cache_sanity_check(memcg, name, size) == 0)
goto out_locked;
/*
* Some allocators will constraint the set of valid flags to a subset
* of all flags. We expect them to define CACHE_CREATE_MASK in this
* case, and we'll just provide them with a sanitized version of the
* passed flags.
*/
flags &= CACHE_CREATE_MASK;
// look for an existing kmem_cache that this request can be merged into (aliasing)
s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
if (s)
goto out_locked;
// allocate the new struct kmem_cache itself; this ultimately goes through slab_alloc
s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
if (s) {
s->object_size = s->size = size;
s->align = calculate_alignment(flags, align, size);
s->ctor = ctor;
if (memcg_register_cache(memcg, s, parent_cache)) {
kmem_cache_free(kmem_cache, s);
err = -ENOMEM;
goto out_locked;
}
s->name = kstrdup(name, GFP_KERNEL);
if (!s->name) {
kmem_cache_free(kmem_cache, s);
err = -ENOMEM;
goto out_locked;
}
err = __kmem_cache_create(s, flags);
if (!err) {
s->refcount = 1;
list_add(&s->list, &slab_caches);// add the new cache to the global slab_caches list
memcg_cache_list_add(memcg, s);
} else {
kfree(s->name);
kmem_cache_free(kmem_cache, s);
}
} else
err = -ENOMEM;
out_locked:
mutex_unlock(&slab_mutex);
put_online_cpus();
if (err) {
if (flags & SLAB_PANIC)
panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
name, err);
else {
printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d",
name, err);
dump_stack();
}
return NULL;
}
return s;
}
void kmem_cache_destroy(struct kmem_cache *s)
{
/* Destroy all the children caches if we aren't a memcg cache */
kmem_cache_destroy_memcg_children(s);
get_online_cpus();
mutex_lock(&slab_mutex);
s->refcount--;
if (!s->refcount) {
list_del(&s->list);// remove the cache from the global slab_caches list
if (!__kmem_cache_shutdown(s)) {// tear down the cache itself
mutex_unlock(&slab_mutex);
if (s->flags & SLAB_DESTROY_BY_RCU)
rcu_barrier();
memcg_release_cache(s);
kfree(s->name);
kmem_cache_free(kmem_cache, s);// return the struct kmem_cache s to the kmem_cache cache
} else {
list_add(&s->list, &slab_caches);
mutex_unlock(&slab_mutex);
printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n",
s->name);
dump_stack();
}
} else {
mutex_unlock(&slab_mutex);
}
put_online_cpus();
}
/*
* Release all resources used by a slab cache.
*/
// Free everything the slub cache holds: the per-cpu slabs and the slabs on the node
// partial lists; the backing pages end up being returned to the buddy system.
static inline int kmem_cache_close(struct kmem_cache *s)
{
int node;
flush_all(s);
/* Attempt to free all objects */
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = get_node(s, node);
free_partial(s, n);
if (n->nr_partial || slabs_node(s, node))
return 1;
}
free_percpu(s->cpu_slab);// free the per-cpu cpu_slab structures
free_kmem_cache_nodes(s);
return 0;
}
// flush every CPU's local slab cache; afterwards no CPU has a local slab left to allocate from
static void flush_all(struct kmem_cache *s)
{
on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
}
static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
{
struct page *page, *h;
list_for_each_entry_safe(page, h, &n->partial, lru) {
if (!page->inuse) {// no object in this slab is in use
remove_partial(n, page);// unlink the slab from the node's partial list
discard_slab(s, page);// return its pages to the buddy system
} else {
list_slab_objects(s, page,
"Objects remaining in %s on kmem_cache_close()");
}
}
}
static void free_kmem_cache_nodes(struct kmem_cache *s)
{
int node;
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = s->node[node];
if (n)
kmem_cache_free(kmem_cache_node, n);// ultimately goes through slab free
s->node[node] = NULL;
}
}
/*
 * When an object is requested from a cache, the allocation order is:
 * 1. the slab currently in use on the local CPU (c->page);
 * 2. a slab taken from the local CPU's partial list (c->partial);
 * 3. a slab taken from the node's partial list;
 * 4. a brand new slab allocated from the buddy system.
 * Freeing an object follows the same order.
 */
static __always_inline void *slab_alloc(struct kmem_cache *s,
gfp_t gfpflags, unsigned long addr)
{
return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
}
// try the local CPU slab first; if that fails, take the slow path, which may allocate new pages from the buddy system
static __always_inline void *slab_alloc_node(struct kmem_cache *s,
gfp_t gfpflags, int node, unsigned long addr)
{
void **object;
struct kmem_cache_cpu *c;
struct page *page;
unsigned long tid;
// checks that the gfp flags are valid for a slab allocation
if (slab_pre_alloc_hook(s, gfpflags))
return NULL;
// pick the correct kmem_cache (memcg-aware)
s = memcg_kmem_get_cache(s, gfpflags);
redo:
/*
* Must read kmem_cache cpu data via this cpu ptr. Preemption is
* enabled. We may switch back and forth between cpus while
* reading from one cpu area. That does not matter as long
* as we end up on the original cpu again when doing the cmpxchg.
*
* Preemption is disabled for the retrieval of the tid because that
* must occur from the current processor. We cannot allow rescheduling
* on a different processor between the determination of the pointer
* and the retrieval of the tid.
*/
// disable preemption and fetch this CPU's kmem_cache_cpu
preempt_disable();
c = __this_cpu_ptr(s->cpu_slab);
/*
* The transaction ids are globally unique per cpu and per operation on
* a per cpu queue. Thus they can be guarantee that the cmpxchg_double
* occurs on the right processor and that there was no operation on the
* linked list in between.
*/
tid = c->tid;
preempt_enable();
object = c->freelist;// first free object of the local CPU slab
page = c->page;// page backing the local CPU slab
if (unlikely(!object || !node_match(page, node)))// no local free object, or the page is on the wrong node: take the slow path
object = __slab_alloc(s, gfpflags, node, addr, c);
else {
// fastpath: allocate straight from the local CPU freelist
void *next_object = get_freepointer_safe(s, object);
/*
* The cmpxchg will only match if there was no additional
* operation and if we are on the right processor.
*
* The cmpxchg does the following atomically (without lock semantics!)
* 1. Relocate first pointer to the current per cpu area.
* 2. Verify that tid and freelist have not been changed
* 3. If they were not changed replace tid and freelist
*
* Since this is without lock semantics the protection is only against
* code executing on this cpu *not* from access by other cpus.
*/
if (unlikely(!this_cpu_cmpxchg_double(
s->cpu_slab->freelist, s->cpu_slab->tid,
object, tid,
next_object, next_tid(tid)))) {
note_cmpxchg_failure("slab_alloc", s, tid);
goto redo;
}
prefetch_freepointer(s, next_object);
stat(s, ALLOC_FASTPATH);
}
if (unlikely(gfpflags & __GFP_ZERO) && object)
memset(object, 0, s->object_size);
slab_post_alloc_hook(s, gfpflags, object);
return object;
}
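The fastpath above pops the first object off c->freelist with a lockless cmpxchg; the tid lets the code notice that the freelist changed or that the task migrated to another CPU, in which case it retries. Below is a rough userspace sketch of the same idea using a single-word compare-and-swap (hypothetical code, not from the kernel; it ignores the ABA problem, which the kernel's tid/cmpxchg_double pairing exists to solve):

#include <stdatomic.h>

struct object {
	struct object *next;	/* free pointer stored inside the free object */
};

static _Atomic(struct object *) freelist;

/* push one object onto the lock-free freelist */
static void freelist_push(struct object *obj)
{
	struct object *old = atomic_load_explicit(&freelist, memory_order_relaxed);

	do {
		obj->next = old;
	} while (!atomic_compare_exchange_weak_explicit(&freelist, &old, obj,
							memory_order_release,
							memory_order_relaxed));
}

/* pop one object, or return NULL when the list is empty */
static struct object *freelist_pop(void)
{
	struct object *old = atomic_load_explicit(&freelist, memory_order_acquire);

	while (old) {
		struct object *next = old->next;

		/* retry if another thread changed the list head in the meantime */
		if (atomic_compare_exchange_weak_explicit(&freelist, &old, next,
							  memory_order_acq_rel,
							  memory_order_acquire))
			break;
	}
	return old;
}

int main(void)
{
	static struct object objs[4];

	for (int i = 0; i < 4; i++)
		freelist_push(&objs[i]);
	while (freelist_pop())
		;	/* drain the list again */
	return 0;
}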
/*
 * Slow path: first try to get an object from the CPU's current slab (c->page).
 * If it has no free object, take a slab from the CPU's partial list, make it
 * the current slab and allocate from it. If that fails as well, take a slab
 * from the node's partial list; as a last resort allocate new pages from the
 * buddy system and install them as the CPU's current slab.
 */
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
unsigned long addr, struct kmem_cache_cpu *c)
{
void *freelist;
struct page *page;
unsigned long flags;
local_irq_save(flags);
#ifdef CONFIG_PREEMPT
/*
* We may have been preempted and rescheduled on a different
* cpu before disabling interrupts. Need to reload cpu area
* pointer.
*/
c = this_cpu_ptr(s->cpu_slab);
#endif
page = c->page;
if (!page)
goto new_slab;
redo:
// the current slab does not belong to the requested node: give it up and get a new slab
if (unlikely(!node_match(page, node))) {
stat(s, ALLOC_NODE_MISMATCH);
// move the CPU's current slab back onto a node partial list
deactivate_slab(s, page, c->freelist);
c->page = NULL;
c->freelist = NULL;
goto new_slab;
}
/*
* By rights, we should be searching for a slab page that was
* PFMEMALLOC but right now, we are losing the pfmemalloc
* information when the page leaves the per-cpu allocator
*/
if (unlikely(!pfmemalloc_match(page, gfpflags))) {
deactivate_slab(s, page, c->freelist);
c->page = NULL;
c->freelist = NULL;
goto new_slab;
}
/* must check again c->freelist in case of cpu migration or IRQ */
freelist = c->freelist;// the CPU's local freelist
if (freelist)
goto load_freelist;
stat(s, ALLOC_SLOWPATH);
// take page->freelist (leaving it NULL) and return the list as it was
freelist = get_freelist(s, page);
if (!freelist) {
c->page = NULL;
stat(s, DEACTIVATE_BYPASS);
goto new_slab;
}
stat(s, ALLOC_REFILL);
load_freelist:
/*
* freelist is pointing to the list of objects to be used.
* page is pointing to the page from which the objects are obtained.
* That page must be frozen for per cpu allocations to work.
*/
VM_BUG_ON(!c->page->frozen);
c->freelist = get_freepointer(s, freelist);
c->tid = next_tid(c->tid);
local_irq_restore(flags);
return freelist;
new_slab:
if (c->partial) {
page = c->page = c->partial;
c->partial = page->next;
stat(s, CPU_PARTIAL_ALLOC);
c->freelist = NULL;
goto redo;
}
freelist = new_slab_objects(s, gfpflags, node, &c);
if (unlikely(!freelist)) {
if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
slab_out_of_memory(s, gfpflags, node);
local_irq_restore(flags);
return NULL;
}
page = c->page;
if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
goto load_freelist;
/* Only entered in the debug case */
if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr))
goto new_slab; /* Slab failed checks. Next slab needed */
deactivate_slab(s, page, get_freepointer(s, freelist));
c->page = NULL;
c->freelist = NULL;
local_irq_restore(flags);
return freelist;
}
/*
 * Refill with objects:
 * 1. try to take a partial slab from the current node's partial list, attach it
 *    to the local CPU cache and return the address of its first free object;
 * 2. if that fails, allocate new pages from the buddy system and install them
 *    into the local CPU cache.
 */
static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
int node, struct kmem_cache_cpu **pc)
{
void *freelist;
struct kmem_cache_cpu *c = *pc;
struct page *page;
// try to find a usable slab on the current node's partial list; if that fails we must get new pages from the buddy system
freelist = get_partial(s, flags, node, c);
if (freelist)
return freelist;
// allocate pages, fill in the slab metadata and install the slab into the local CPU cache
page = new_slab(s, flags, node);
if (page) {
c = __this_cpu_ptr(s->cpu_slab);
if (c->page)// the CPU already has a current slab; flush it out first
flush_slab(s, c);
/*
* No other reference to the page yet so we can
* muck around with it freely without cmpxchg
*/
freelist = page->freelist;
page->freelist = NULL;
stat(s, ALLOC_SLAB);
c->page = page;
*pc = c;
} else
freelist = NULL;
return freelist; // address of the slab's first free object
}
/*
* Try to allocate a partial slab from a specific node.
*/
// try to take slabs from the given node's partial list and attach them to the kmem_cache_cpu
static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
struct kmem_cache_cpu *c, gfp_t flags)
{
struct page *page, *page2;
void *object = NULL;
int available = 0;
int objects;
/*
* Racy check. If we mistakenly see no partial slabs then we
* just allocate an empty slab. If we mistakenly try to get a
* partial slab and there is none available then get_partials()
* will return NULL.
*/
if (!n || !n->nr_partial)
return NULL;
spin_lock(&n->list_lock);
// walk node n's partial list
list_for_each_entry_safe(page, page2, &n->partial, lru) {
void *t;
if (!pfmemalloc_match(page, flags))
continue;
// detach one slab from the node's partial list
t = acquire_slab(s, n, page, object == NULL, &objects);
if (!t)
break;
available += objects;// number of free objects gathered so far
if (!object) {// first slab acquired: make it the local CPU's current slab
c->page = page;
stat(s, ALLOC_FROM_PARTIAL);
object = t;
} else {
put_cpu_partial(s, page, 0);// chain the extra slab onto the CPU's partial list
stat(s, CPU_PARTIAL_NODE);
}
if (kmem_cache_debug(s) || available > s->cpu_partial / 2)
break;
}
spin_unlock(&n->list_lock);
return object;
}
// allocate pages for the cache, fill in per-object debug data and initialize the page (slab) metadata
static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
struct page *page;
void *start;
void *last;
void *p;
int order;
BUG_ON(flags & GFP_SLAB_BUG_MASK);
page = allocate_slab(s,
flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
if (!page)
goto out;
order = compound_order(page);// order of the compound page backing this slab
inc_slabs_node(s, page_to_nid(page), page->objects);
memcg_bind_pages(s, order);
page->slab_cache = s;
__SetPageSlab(page);
if (page->pfmemalloc)
SetPageSlabPfmemalloc(page);
start = page_address(page);// virtual address of the page
// with the SLAB_POISON debug option, the whole slab is filled with POISON_INUSE (0x5a)
if (unlikely(s->flags & SLAB_POISON))
memset(start, POISON_INUSE, PAGE_SIZE << order);
last = start;
kasan_poison_slab(page);
// initialize all objects in the new slab and chain them into a freelist
for_each_object(p, s, start, page->objects) {// walk the objects one by one; s->size is the stride from one object to the next
setup_object(s, page, last);// set up debug data for the object at 'last' according to the debug options
set_freepointer(s, last, p);// make the free pointer of 'last' point to the next object p
last = p;
}
setup_object(s, page, last);
set_freepointer(s, last, NULL);
page->freelist = start;
page->inuse = page->objects; // total number of objects
page->frozen = 1;
out:
return page;
}
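The net effect of the loop above is to link object i to object i + 1 through the free pointer stored inside each free object, terminating the list with NULL. A minimal userspace sketch of that chaining (assumptions: a flat char buffer stands in for the slab page, and the free pointer lives at offset 0 inside each free object, as in a cache without debugging):

#include <stdio.h>
#include <string.h>

#define OBJ_SIZE	64	/* stands in for s->size: stride between objects */
#define NR_OBJECTS	8	/* stands in for page->objects */

static unsigned char slab[OBJ_SIZE * NR_OBJECTS];

static void set_freepointer(void *object, void *next)
{
	memcpy(object, &next, sizeof(next));	/* free pointer at offset 0 */
}

static void *get_freepointer(void *object)
{
	void *next;

	memcpy(&next, object, sizeof(next));
	return next;
}

int main(void)
{
	void *freelist = slab;	/* stands in for page->freelist */
	unsigned char *p;

	/* same net effect as the loop in new_slab(): object i -> object i + 1 */
	for (p = slab; p + OBJ_SIZE < slab + sizeof(slab); p += OBJ_SIZE)
		set_freepointer(p, p + OBJ_SIZE);
	set_freepointer(slab + OBJ_SIZE * (NR_OBJECTS - 1), NULL);

	/* walk the freelist the way the allocation path pops objects */
	for (void *obj = freelist; obj; obj = get_freepointer(obj))
		printf("free object at offset %td\n", (unsigned char *)obj - slab);
	return 0;
}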
/* Object debug checks for alloc/free paths */
// builds per-object debug information: poisoning, red zones, alloc/free call stacks
static void setup_object_debug(struct kmem_cache *s, struct page *page,
void *object)
{
if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
return;
init_object(s, object, SLUB_RED_INACTIVE);
init_tracking(s, object);
}
/*
 * With poisoning, the first object_size - 1 bytes of the object are filled with
 * 0x6b (POISON_FREE) and the last payload byte is set to 0xa5 (POISON_END).
 * With red zoning, the following (s->inuse - s->object_size) bytes are filled
 * with the red-zone value (0xbb, SLUB_RED_INACTIVE, for a free object).
 * s->inuse is the part of each object that is actually used, i.e. the object
 * data plus the red zone.
 */
static void init_object(struct kmem_cache *s, void *object, u8 val)
{
u8 *p = object;
if (s->flags & __OBJECT_POISON) {
memset(p, POISON_FREE, s->object_size - 1);
p[s->object_size - 1] = POISON_END;
}
if (s->flags & SLAB_RED_ZONE)
memset(p + s->object_size, val, s->inuse - s->object_size);
}
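To make the layout concrete, here is a small userspace sketch that reproduces the fill pattern of init_object() on a dummy buffer and dumps it (the POISON_*/SLUB_RED_INACTIVE values are the kernel's; object_size and inuse are made-up example numbers):

#include <stdio.h>
#include <string.h>

#define POISON_FREE		0x6b	/* filler for the payload of a free object */
#define POISON_END		0xa5	/* last payload byte of a poisoned object */
#define SLUB_RED_INACTIVE	0xbb	/* red-zone value for a free object */

int main(void)
{
	unsigned char obj[24];
	size_t object_size = 16;	/* stands in for s->object_size */
	size_t inuse = sizeof(obj);	/* stands in for s->inuse */

	memset(obj, POISON_FREE, object_size - 1);
	obj[object_size - 1] = POISON_END;
	memset(obj + object_size, SLUB_RED_INACTIVE, inuse - object_size);

	for (size_t i = 0; i < inuse; i++)
		printf("%02x%c", obj[i], (i + 1) % 8 ? ' ' : '\n');
	return 0;
}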
// with SLAB_STORE_USER set, initialize the per-object alloc/free tracking (call stack) records
static void init_tracking(struct kmem_cache *s, void *object)
{
if (!(s->flags & SLAB_STORE_USER))
return;
set_track(s, object, TRACK_FREE, 0UL);
set_track(s, object, TRACK_ALLOC, 0UL);
}
// Below is the slab free path
static __always_inline void slab_free(struct kmem_cache *s,
struct page *page, void *x, unsigned long addr)
{
void **object = (void *)x;
struct kmem_cache_cpu *c;
unsigned long tid;
slab_free_hook(s, x);
redo:
/*
* Determine the currently cpus per cpu slab.
* The cpu may change afterward. However that does not matter since
* data is retrieved via this pointer. If we are on the same cpu
* during the cmpxchg then the free will succedd.
*/
preempt_disable();
c = __this_cpu_ptr(s->cpu_slab);
tid = c->tid;
preempt_enable();
// if the object's page is the CPU's current slab, push the object onto the local freelist; otherwise take the slow free path
if (likely(page == c->page)) {
set_freepointer(s, object, c->freelist);
if (unlikely(!this_cpu_cmpxchg_double(
s->cpu_slab->freelist, s->cpu_slab->tid,
c->freelist, tid,
object, next_tid(tid)))) {
note_cmpxchg_failure("slab_free", s, tid);
goto redo;
}
stat(s, FREE_FASTPATH);
} else
__slab_free(s, page, x, addr);
}
static void __slab_free(struct kmem_cache *s, struct page *page,
void *x, unsigned long addr)
{
void *prior;
void **object = (void *)x;
int was_frozen;
struct page new;
unsigned long counters;
struct kmem_cache_node *n = NULL;
unsigned long uninitialized_var(flags);
stat(s, FREE_SLOWPATH);
// with debugging enabled, bail out if the object fails the consistency checks (e.g. a detected overwrite)
if (kmem_cache_debug(s) &&
!(n = free_debug_processing(s, page, x, addr, &flags)))
return;
do {
if (unlikely(n)) {
spin_unlock_irqrestore(&n->list_lock, flags);
n = NULL;
}
// the page is not the CPU's current slab, so it sits on a node list (partial or, in the debug case, full)
// prior == NULL means the slab had no free object before this free (it was full)
prior = page->freelist;
counters = page->counters;
set_freepointer(s, object, prior);// link the object in front of the page's existing freelist
new.counters = counters;
was_frozen = new.frozen;
new.inuse--;
if ((!new.inuse || !prior) && !was_frozen) {
if (!kmem_cache_debug(s) && !prior)
/*
* Slab was on no list before and will be partially empty
* We can defer the list move and instead freeze it.
*/
new.frozen = 1;
else { /* Needs to be taken off a list */
n = get_node(s, page_to_nid(page));
/*
* Speculatively acquire the list_lock.
* If the cmpxchg does not succeed then we may
* drop the list_lock without any processing.
*
* Otherwise the list_lock will synchronize with
* other processors updating the list of slabs.
*/
spin_lock_irqsave(&n->list_lock, flags);
}
}
} while (!cmpxchg_double_slab(s, page,
prior, counters,
object, new.counters,
"__slab_free"));
if (likely(!n)) {
/*
* If we just froze the page then put it onto the
* per cpu partial list.
*/
if (new.frozen && !was_frozen) {
put_cpu_partial(s, page, 1);
stat(s, CPU_PARTIAL_FREE);
}
/*
* The list lock was not taken therefore no list
* activity can be necessary.
*/
if (was_frozen)
stat(s, FREE_FROZEN);
return;
}
if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
goto slab_empty;
/*
* Objects left in the slab. If it was not on the partial list before
* then add it.
*/
// if the slab was on the node's full list (debug case), move it from the full list to the partial list
if (kmem_cache_debug(s) && unlikely(!prior)) {
remove_full(s, page);
add_partial(n, page, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
spin_unlock_irqrestore(&n->list_lock, flags);
return;
slab_empty:
// the slab is now empty: remove it from whichever node list it is on
if (prior) {
/*
* Slab on the partial list.
*/
remove_partial(n, page);
stat(s, FREE_REMOVE_PARTIAL);
} else
/* Slab must be on the full list */
remove_full(s, page);
spin_unlock_irqrestore(&n->list_lock, flags);
stat(s, FREE_SLAB);
discard_slab(s, page);// return the pages to the buddy system
}
Finally, note that kmalloc and kfree are themselves implemented on top of the slub allocator as well.
void *__kmalloc(size_t size, gfp_t flags)
{
struct kmem_cache *s;
void *ret;
if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
return kmalloc_large(size, flags);
s = kmalloc_slab(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
ret = slab_alloc(s, flags, _RET_IP_);
trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
kasan_kmalloc(s, ret, size);
return ret;
}
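kmalloc_slab() simply picks the pre-created kmalloc-<size> cache that covers the requested size, so an ordinary kmalloc() is just a slab_alloc() from one of those caches. A rough userspace sketch of the rounding idea (an approximation: real kernels also have kmalloc-96 and kmalloc-192 caches, and the smallest cache size is architecture dependent):

#include <stdio.h>

/* round a request up to a power-of-two kmalloc-<size> class (approximation) */
static unsigned long kmalloc_class(unsigned long size)
{
	unsigned long c = 8;	/* assumed smallest kmalloc cache */

	while (c < size)
		c <<= 1;
	return c;
}

int main(void)
{
	unsigned long sizes[] = { 24, 100, 512, 3000 };

	for (int i = 0; i < 4; i++)
		printf("kmalloc(%lu) would typically be served from kmalloc-%lu\n",
		       sizes[i], kmalloc_class(sizes[i]));
	return 0;
}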