slub内存管理的4个主要接口函数如下(参考kernel-4.19代码):
//slab缓存的创建
struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *));
//slab object的分配
void *kmem_cache_alloc(struct kmem_cache *cachep, int flags);
//slab object的释放
void kmem_cache_free(struct kmem_cache *cachep, void *objp);
//slab缓存的释放
void kmem_cache_destroy(struct kmem_cache *);
本篇主要介绍slab object分配的函数kmem_cache_alloc
一、函数关系调用图
1、kmem_cache_alloc函数调用关系图
2、两个路径分配对应的object分配情况说明
2.1 fastpath
如果c->freelist不为空,则走的是fastpath,直接从c->freelist分配一个空闲object,同时将c->freelist指向下一个空闲object,状态:ALLOC_FASTPATH++。如果c->freelist为NULL,则跳转到slowpath。
2.2 slowpath-1
如果c->freelist为空,slowpath首先判断c->page->freelist,如果不为空,走slowpath-1,从page->freelist中得到空闲object,分配出去,再利用get_freepointer更新c->freelist。代码流程:get_freelist->get_freepointer。状态:ALLOC_REFILL++,ALLOC_SLOWPATH++
2.3 slowpath-2
如果c->page为NULL,则无法走slowpath-1,若此时c->partial不为空,则走slowpath-2,将c->partial链表中的第一个page及对应的slab迁移到c->page中。然后再重新走一遍slowpath-1。代码流程:slub_percpu_partial->get_freelist->get_freepointer。状态:CPU_PARTIAL_ALLOC++,ALLOC_REFILL++,ALLOC_SLOWPATH++
2.4 slowpath-3
如果c->freelist,page->freelist和c->partial均为NULL,则从s->node->partial链表中找寻空闲object。如果能够找到,则进入slowpath-3。这个慢速路径分配会从Node管理的partial链表中迁移部分slab到c->partial中,同时更新c->page和c->freelist。同时,会判断当前迁移的空闲object对象数目是否超过s->cpu_partial的一半,如果是,停止继续迁移到c->partial。代码流程:new_slab_objects->get_partial->get_partial_node(get_any_partial)->get_freepointer。状态:ALLOC_FROM_PARTIAL++,CPU_PARTIAL_NODE++,ALLOC_SLOWPATH++
2.5 slowpath-4
如果Node partial中无法得到空闲的object,那么只能从Buddy system分配新的页面,根据alloc_gfp和s->oo,初始化struct page,然后直接添加到c->page,更新c->freelist。代码流程:new_slab_objects->new_slab->allocate_slab->alloc_slab_page->get_freepointer。状态:ALLOC_SLAB++,ALLOC_SLOWPATH++。
kmem_cache_alloc函数主要调用slab_alloc函数和trace_kmem_cache_alloc函数,其中核心函数是slab_alloc函数,其直接调用slab_alloc_node函数来实现slab缓存 object的分配;trace_kmem_cache_alloc函数属于ftrace机制,会记录分配slab object时的信息,用于debug调试。下面重点介绍slab_alloc_node函数
include/linux/kernel.h
//GCC builtin: __builtin_return_address(level) yields a return address from the
//current call stack.  Level 0 is the return address of the current function,
//i.e. the address in the CALLER where execution resumes; level 1 would be one
//frame further up.  _RET_IP_ therefore identifies the call site of the
//function that uses it (here: who called kmem_cache_alloc).
#define _RET_IP_ (unsigned long)__builtin_return_address(0)
mm/slub.c
/*
 * Thin allocation entry point: forwards to slab_alloc_node() with no NUMA
 * node preference (NUMA_NO_NODE).  __always_inline so that the lockless
 * fastpath is folded directly into kmem_cache_alloc()/kmalloc().
 */
static __always_inline void *slab_alloc(struct kmem_cache *s,
gfp_t gfpflags, unsigned long addr)
{
return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
}
/*
 * Allocate one object from cache @s.  The real work is done by slab_alloc();
 * the tracepoint only records the event for ftrace-based debugging.
 */
void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
//core allocation: fastpath/slowpath selection happens inside slab_alloc()
void *ret = slab_alloc(s, gfpflags, _RET_IP_);
//ftrace tracepoint: records caller, returned object, object/slab size and gfp flags
trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
s->size, gfpflags);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc);
二、slab_alloc_node函数
/*
 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
 * have the fastpath folded into their functions. So no function call
 * overhead for requests that can be satisfied on the fastpath.
 *
 * The fastpath works by first checking if the lockless freelist can be used.
 * If not then __slab_alloc is called for slow processing.
 *
 * Otherwise we can simply pick the next object from the lockless free list.
 */
static __always_inline void *slab_alloc_node(struct kmem_cache *s,
gfp_t gfpflags, int node, unsigned long addr)
{
void *object;
struct kmem_cache_cpu *c;
struct page *page;
unsigned long tid;
s = slab_pre_alloc_hook(s, gfpflags);//pre-allocation hook: gfp masking, might_sleep, failslab, memcg redirection
if (!s)
return NULL;
redo:
/*
 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
 * enabled. We may switch back and forth between cpus while
 * reading from one cpu area. That does not matter as long
 * as we end up on the original cpu again when doing the cmpxchg.
 *
 * We should guarantee that tid and kmem_cache are retrieved on
 * the same cpu. It could be different if CONFIG_PREEMPT so we need
 * to check if it is matched or not.
 */
//make sure tid and kmem_cache_cpu come from the same CPU: with CONFIG_PREEMPT,
//retry while the tid we read does not match the tid of the cpu_slab we read
do {
tid = this_cpu_read(s->cpu_slab->tid);
c = raw_cpu_ptr(s->cpu_slab);
} while (IS_ENABLED(CONFIG_PREEMPT) &&
unlikely(tid != READ_ONCE(c->tid)));
/*
 * Irqless object alloc/free algorithm used here depends on sequence
 * of fetching cpu_slab's data. tid should be fetched before anything
 * on c to guarantee that object and page associated with previous tid
 * won't be used with current tid. If we fetch tid first, object and
 * page could be one associated with next tid and our alloc/free
 * request will be failed. In this case, we will retry. So, no problem.
 */
//compiler barrier: keeps the reads of c->freelist/c->page below from being
//reordered before the tid read above, so object/page belong to this tid
barrier();
/*
 * The transaction ids are globally unique per cpu and per operation on
 * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
 * occurs on the right processor and that there was no operation on the
 * linked list in between.
 */
object = c->freelist;//first free object on this CPU's lockless freelist
page = c->page;//slab page this CPU is currently allocating from
//take the slow path when the per-cpu freelist is empty or the page does not
//belong to the requested node (with the default NUMA_NO_NODE, node_match()
//is always true) -- this is the fastpath/slowpath fork
if (unlikely(!object || !node_match(page, node))) {
//slow-path allocation; the core worker is __slab_alloc
object = __slab_alloc(s, gfpflags, node, addr, c);
//account the slow path: ALLOC_SLOWPATH++
stat(s, ALLOC_SLOWPATH);
} else {
//fastpath: read the pointer to the next free object, which is stored inside
//the current object at offset s->offset (freepointer-in-object scheme).
//It must be fetched before the object is handed out, because the caller
//will overwrite the object's contents.
void *next_object = get_freepointer_safe(s, object);
/*
 * The cmpxchg will only match if there was no additional
 * operation and if we are on the right processor.
 *
 * The cmpxchg does the following atomically (without lock
 * semantics!)
 * 1. Relocate first pointer to the current per cpu area.
 * 2. Verify that tid and freelist have not been changed
 * 3. If they were not changed replace tid and freelist
 *
 * Since this is without lock semantics the protection is only
 * against code executing on this cpu *not* from access by
 * other cpus.
 */
/* The cmpxchg atomically:
 * (1) operates on this CPU's per-cpu area;
 * (2) verifies that neither tid nor freelist changed (no migration or
 *     interrupt raced with us);
 * (3) if unchanged, installs the new values:
 *         s->cpu_slab->freelist = next_object
 *         s->cpu_slab->tid = next_tid(tid)
 *     so c->freelist now points at the next free object.
 */
if (unlikely(!this_cpu_cmpxchg_double(
s->cpu_slab->freelist, s->cpu_slab->tid,
object, tid,
next_object, next_tid(tid)))) {
//lost the race: log via note_cmpxchg_failure() and retry from redo
note_cmpxchg_failure("slab_alloc", s, tid);
goto redo;
}
//prefetch next_object's freepointer (object + s->offset) into the CPU cache
//to improve hit rate on the next allocation
prefetch_freepointer(s, next_object);
stat(s, ALLOC_FASTPATH);
}
//if objects are wiped on free, re-clear the freepointer slot stored inside
//the returned object (at s->offset, typically the first 8 bytes)
maybe_wipe_obj_freeptr(s, object);
//zero the whole object when __GFP_ZERO (or init_on_alloc hardening) is requested
if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
memset(object, 0, s->object_size);
//post-allocation processing of the object: kmemleak/kasan tagging
slab_post_alloc_hook(s, gfpflags, 1, &object);
return object;
}
2.1 slab_pre_alloc_hook函数
include/linux/gfp.h
/*
* gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
* GFP flags are used before interrupts are enabled. Once interrupts are
* enabled, it is set to __GFP_BITS_MASK while the system is running. During
* hibernation, it is used by PM to avoid I/O during memory allocation while
* devices are suspended.
*/
//gfp_allowed_mask在启动早期阶段(中断未使能前),标志是GFP_BOOT_MASK;在中断起来后,设置为__GFP_BITS_MASK,在这里就是__GFP_BITS_MASK
extern gfp_t gfp_allowed_mask;
mm/slab.h
/*
 * Pre-allocation hook: filters the gfp flags, annotates possible sleeping,
 * applies fault injection, and redirects to a memcg cache when required.
 * Returns the cache to allocate from, or NULL to fail the allocation.
 */
static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
gfp_t flags)
{
//mask off GFP bits not currently allowed (see include/linux/gfp.h);
//e.g. GFP_KERNEL is 0x6000c0
flags &= gfp_allowed_mask;
//these two are only meaningful with lockdep enabled: annotate potential
//recursion into fs reclaim
fs_reclaim_acquire(flags);
fs_reclaim_release(flags);
//the steps above are flag sanity checks
//if the flags allow blocking (e.g. GFP_KERNEL), the caller may sleep here.
//Example: when the allocation must fall back to the buddy system to build a
//new slab, it can take a while, so the CPU is yielded to other threads and
//the allocator resumes once an object can be obtained.
might_sleep_if(gfpflags_allow_blocking(flags));
//fault-injection hook (failslab); normally returns 0
if (should_failslab(s, flags))
return NULL;
//with memcg_kmem enabled and either __GFP_ACCOUNT in flags or SLAB_ACCOUNT
//on the cache, switch to the kmem_cache belonging to the current memcg
if (memcg_kmem_enabled() &&
((flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT)))
return memcg_kmem_get_cache(s);
//otherwise allocate from the cache as-is
return s;
}
2.2 object慢速分配的核心函数:__slab_alloc函数和___slab_alloc函数
__slab_alloc函数
mm/slub.c
/*
 * Another one that disabled interrupt and compensates for possible
 * cpu changes by refetching the per cpu area pointer.
 */
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
unsigned long addr, struct kmem_cache_cpu *c)
{
void *p;
unsigned long flags;
//disable interrupts on this CPU, saving the previous irq state in flags
local_irq_save(flags);
#ifdef CONFIG_PREEMPT
/*
 * We may have been preempted and rescheduled on a different
 * cpu before disabling interrupts. Need to reload cpu area
 * pointer.
 */
//with preemption we may have migrated CPUs before irqs were disabled:
//re-fetch the per-cpu pointer via this_cpu_ptr()
c = this_cpu_ptr(s->cpu_slab);
#endif
//core slow-path worker
p = ___slab_alloc(s, gfpflags, node, addr, c);
//restore the interrupt state recorded in flags
local_irq_restore(flags);
return p;
}
下面开始介绍___slab_alloc函数
mm/slub.c
最重要的正主来了。。。。
/*
 * Slow-path allocator.  Runs with interrupts disabled (see __slab_alloc).
 * Tries, in order: the current cpu slab page (slowpath-1), the cpu partial
 * list (slowpath-2), the node partial lists (slowpath-3), and finally a
 * brand-new slab from the buddy system (slowpath-4).
 */
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
unsigned long addr, struct kmem_cache_cpu *c)
{
void *freelist;
struct page *page;
//the slab page this CPU is currently allocating from (cpu_slab->page)
page = c->page;
//no cpu-local slab at all: jump to new_slab to obtain one
if (!page) {
/*
 * if the node is not online or has no normal memory, just
 * ignore the node constraint
 */
//with the default NUMA_NO_NODE input this check is a no-op
if (unlikely(node != NUMA_NO_NODE &&
!node_state(node, N_NORMAL_MEMORY)))
node = NUMA_NO_NODE;
goto new_slab;
}
redo:
//c->page exists; with node == NUMA_NO_NODE node_match() returns true,
//so this branch is normally skipped
if (unlikely(!node_match(page, node))) {
/*
 * same as above but node_match() being false already
 * implies node != NUMA_NO_NODE
 */
//the requested node has no normal memory: drop the constraint and retry
if (!node_state(node, N_NORMAL_MEMORY)) {
node = NUMA_NO_NODE;
goto redo;
} else {
stat(s, ALLOC_NODE_MISMATCH);
//the cpu slab belongs to the wrong node: deactivate it
//(deactivate_slab moves it onto its node's partial list and clears the
//CPU association), then allocate a fresh slab via new_slab
deactivate_slab(s, page, c->freelist, c);
goto new_slab;
}
}
/*
 * By rights, we should be searching for a slab page that was
 * PFMEMALLOC but right now, we are losing the pfmemalloc
 * information when the page leaves the per-cpu allocator
 */
//page was allocated from pfmemalloc reserves but this request is not entitled
//to them (e.g. __GFP_NOMEMALLOC): deactivate the slab and get a new one
if (unlikely(!pfmemalloc_match(page, gfpflags))) {
deactivate_slab(s, page, c->freelist, c);
goto new_slab;
}
/* must check again c->freelist in case of cpu migration or IRQ */
/*
 * Re-check the freelist: an IRQ or CPU migration before interrupts were
 * disabled may have refilled it.  On the first pass it is NULL (that is
 * why we entered the slow path); after jumps from the branches below it
 * may be non-NULL.
 */
freelist = c->freelist;
if (freelist)
//non-empty: jump to load_freelist and hand out an object
goto load_freelist;
//slowpath-1: freelist empty, so take page->freelist (this also sets
//page->freelist to NULL and freezes the page for this CPU)
freelist = get_freelist(s, page);
//page had no free objects either: drop it and account DEACTIVATE_BYPASS
if (!freelist) {
c->page = NULL;
stat(s, DEACTIVATE_BYPASS);
goto new_slab;
}
//got objects from the page: account ALLOC_REFILL and fall into load_freelist
stat(s, ALLOC_REFILL);
load_freelist:
/*
 * freelist is pointing to the list of objects to be used.
 * page is pointing to the page from which the objects are obtained.
 * That page must be frozen for per cpu allocations to work.
 */
//frozen must be 1 here: pages owned by a CPU (cpu slab or cpu partial) are
//frozen, and pages taken from a node partial list get frozen when they are
//attached to c->page
VM_BUG_ON(!c->page->frozen);
//advance the cpu freelist to the next free object
c->freelist = get_freepointer(s, freelist);
//bump the transaction id, then return the allocated object
c->tid = next_tid(c->tid);
return freelist;
new_slab:
//slowpath-2: c->page could not satisfy us (slowpath-1 failed); if the cpu
//partial list is non-empty, promote its first page (and slab) to c->page
if (slub_percpu_partial(c)) {
page = c->page = slub_percpu_partial(c);
slub_set_percpu_partial(c, page);//c->partial = page->next
//account CPU_PARTIAL_ALLOC and retry from redo
stat(s, CPU_PARTIAL_ALLOC);
goto redo;
}
//slowpath-3 and slowpath-4: the cpu partial list is empty too, so
//new_slab_objects() refills from the node partial lists or, failing that,
//from the buddy system, updating c->page and c->freelist
freelist = new_slab_objects(s, gfpflags, node, &c);
if (unlikely(!freelist)) {
//neither node partial nor the buddy system could deliver: log the OOM
//(visible in the kernel log) and fail the allocation
slab_out_of_memory(s, gfpflags, node);
return NULL;
}
page = c->page;
//success: with slub debugging off and a matching pfmemalloc state, hand the
//object out through load_freelist; otherwise continue below
if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
goto load_freelist;
//debugging enabled but the new slab failed its checks: try yet another slab
/* Only entered in the debug case */
if (kmem_cache_debug(s) &&
!alloc_debug_processing(s, page, freelist, addr))
goto new_slab; /* Slab failed checks. Next slab needed */
//deactivate the cpu slab (debug/pfmemalloc mismatch case)
deactivate_slab(s, page, get_freepointer(s, freelist), c);
//return the object obtained on the slow path
return freelist;
}
2.2.1 pfmemalloc_match(page, gfpflags)函数
mm/internal.h
/*
 * Internal allocation-control flags (mm/internal.h): the low bits index into
 * zone->watermark, the higher bits modify page-allocator behaviour.
 */
/* The ALLOC_WMARK bits are used as an index to zone->watermark */
#define ALLOC_WMARK_MIN WMARK_MIN
#define ALLOC_WMARK_LOW WMARK_LOW
#define ALLOC_WMARK_HIGH WMARK_HIGH
#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
/* Mask to get the watermark bits */
#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
/*
 * Only MMU archs have async oom victim reclaim - aka oom_reaper so we
 * cannot assume a reduced access to memory reserves is sufficient for
 * !MMU
 */
#ifdef CONFIG_MMU
#define ALLOC_OOM 0x08
#else
#define ALLOC_OOM ALLOC_NO_WATERMARKS
#endif
#define ALLOC_HARDER 0x10 /* try to alloc harder */
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
mm/page_alloc.c
/*
 * Decide whether this allocation may use pfmemalloc/emergency reserves.
 * Returns 0 if not, or the ALLOC_* bits granting access.
 */
static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
{
//__GFP_NOMEMALLOC explicitly forbids the reserves: return 0, which makes
//pfmemalloc_match() fail and forces deactivate_slab + new_slab upstream
if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
return 0;
if (gfp_mask & __GFP_MEMALLOC)
return ALLOC_NO_WATERMARKS;
if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
return ALLOC_NO_WATERMARKS;
if (!in_interrupt()) {
if (current->flags & PF_MEMALLOC)
return ALLOC_NO_WATERMARKS;
else if (oom_reserves_allowed(current))
return ALLOC_OOM;
}
return 0;
}
/*
 * True when the caller is entitled to pfmemalloc reserves for this request,
 * i.e. __gfp_pfmemalloc_flags() granted any ALLOC_* bits.
 */
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
{
return __gfp_pfmemalloc_flags(gfp_mask) != 0;
}
mm/slub.c
/*
 * A slab page carrying the SlabPfmemalloc flag was allocated from emergency
 * reserves; its objects may only be handed to requests that are themselves
 * entitled to those reserves.  Ordinary pages always match.
 */
static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
{
if (likely(!PageSlabPfmemalloc(page)))
return true;
return gfp_pfmemalloc_allowed(gfpflags);
}
2.2.2 new_slab_objects(s, gfpflags, node, &c)函数
mm/slub.c
/*
 * Put a page that was just frozen (in __slab_free) into a partial page
 * slot if available.
 *
 * If we did not find a slot then simply move all the partials to the
 * per node partial list.
 */
static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
struct page *oldpage;
int pages;
int pobjects;
//this sequence must run without being migrated to another CPU
preempt_disable();
do {
pages = 0;
pobjects = 0;
//oldpage is the current head of the cpu partial list; only the head page
//carries the aggregate counters: pobjects (free objects on the whole cpu
//partial list) and pages (number of slabs on it)
oldpage = this_cpu_read(s->cpu_slab->partial);
if (oldpage) {
pobjects = oldpage->pobjects;
pages = oldpage->pages;
//with drain == 0 (the allocation path taken from get_partial_node) this is
//skipped; on the free path (drain != 0), once the free-object count exceeds
//s->cpu_partial the cpu partial list is full and all of its pages are
//moved to the per-node partial lists
if (drain && pobjects > s->cpu_partial) {
unsigned long flags;
/*
 * partial array is full. Move the existing
 * set to the per node partial list.
 */
local_irq_save(flags);
//move every page on the cpu partial list to the tail of its node's partial
//list with frozen set to 0; pages of nodes where n->nr_partial >=
//s->min_partial are gathered on a discard list and freed in one go
unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
local_irq_restore(flags);
oldpage = NULL;
pobjects = 0;
pages = 0;
//account CPU_PARTIAL_DRAIN
stat(s, CPU_PARTIAL_DRAIN);
}
}
//the allocation path arrives here: fold the new page into the counters
pages++;
pobjects += page->objects - page->inuse;
//link the new page in as the head of the cpu partial list
page->pages = pages;
page->pobjects = pobjects;
page->next = oldpage;
//retry until this_cpu_cmpxchg atomically installs page as the new head
//(it returns the previous value; success when that equals oldpage)
} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
!= oldpage);
//if s->cpu_partial is 0 the cache keeps no per-cpu partial pages at all,
//so immediately unfreeze what we queued onto the node partial lists
//(see the slub allocator working-principles article)
if (unlikely(!s->cpu_partial)) {
unsigned long flags;
local_irq_save(flags);
unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
local_irq_restore(flags);
}
preempt_enable();
#endif
}
/*
 * Try to allocate a partial slab from a specific node.
 */
static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
struct kmem_cache_cpu *c, gfp_t flags)
{
struct page *page, *page2;
void *object = NULL;
unsigned int available = 0;
int objects;
/*
 * Racy check. If we mistakenly see no partial slabs then we
 * just allocate an empty slab. If we mistakenly try to get a
 * partial slab and there is none available then get_partials()
 * will return NULL.
 */
//bail out when the node (or its partial list) is empty
if (!n || !n->nr_partial)
return NULL;
spin_lock(&n->list_lock);
//walk the node's partial list (safe variant: pages are removed inside the loop)
list_for_each_entry_safe(page, page2, &n->partial, lru) {
void *t;
//skip pages whose pfmemalloc state does not match this request
if (!pfmemalloc_match(page, flags))
continue;
//acquire_slab removes the page from the node partial list, updates
//n->nr_partial, flips page->frozen from 0 to 1 and returns its freelist
//(objects is set to the number of free objects on that slab)
t = acquire_slab(s, n, page, object == NULL, &objects);
//acquire_slab lost a race for this page (its cmpxchg failed): stop scanning
if (!t)
break;
available += objects;//running total of free objects acquired so far
//first acquired page becomes the cpu slab; later ones go to the cpu partial list
if (!object) {
//attach the first page to the local cpu cache (c->page) and account
//ALLOC_FROM_PARTIAL
c->page = page;
stat(s, ALLOC_FROM_PARTIAL);
object = t;//remember its freelist as the object to return
} else {
//1.1.1 queue this page on the cpu partial list and account CPU_PARTIAL_NODE
put_cpu_partial(s, page, 0);
stat(s, CPU_PARTIAL_NODE);
}
//without CONFIG_SLUB_CPU_PARTIAL (it is enabled by default),
//kmem_cache_has_cpu_partial() is 0 and we stop after the first page;
//otherwise keep migrating until the acquired free objects exceed half of
//s->cpu_partial
if (!kmem_cache_has_cpu_partial(s)
|| available > slub_cpu_partial(s) / 2)
break;
}
spin_unlock(&n->list_lock);
//freelist of the first acquired page (NULL if nothing was found)
return object;
}
/*
 * Get a partial page, lock it and return it.
 */
static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
struct kmem_cache_cpu *c)
{
void *object;
int searchnode = node;
if (node == NUMA_NO_NODE)
//no preference: use this CPU's local memory node (an index)
searchnode = numa_mem_id();
else if (!node_present_pages(node))
searchnode = node_to_mem_node(node);
//get_node() maps searchnode to s->node[searchnode];
//1.1 get_partial_node migrates slabs from that node into the cpu partial
//list, sets c->page and returns the first acquired freelist
object = get_partial_node(s, get_node(s, searchnode), c, flags);
//done if the node delivered an object, or if a specific node was requested
//(with the default NUMA_NO_NODE input we fall through on failure)
if (object || node != NUMA_NO_NODE)
return object;
//search the remaining nodes (other than searchnode) the same way.  Only
//meaningful on NUMA systems (non-NUMA returns NULL directly), and gated by
//s->remote_node_defrag_ratio: the smaller that value, the stronger the
//preference for the local node
return get_any_partial(s, flags, c);
}
//2.1 obtain pages from the buddy system and initialize them as a slab
static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
struct page *page;
struct kmem_cache_order_objects oo = s->oo;//preferred order/objects packing
gfp_t alloc_gfp;
void *start, *p;
int idx, order;
bool shuffle;
flags &= gfp_allowed_mask;
if (gfpflags_allow_blocking(flags))
local_irq_enable();
flags |= s->allocflags;
/*
 * Let the initial higher-order allocation fail under memory pressure
 * so we fall-back to the minimum order allocation.
 */
alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
//allocate pages at the order encoded in oo with alloc_gfp
page = alloc_slab_page(s, alloc_gfp, node, oo);
if (unlikely(!page)) {
//the s->oo order failed: retry at s->min, the smallest order that still
//holds at least one object
oo = s->min;
alloc_gfp = flags;
/*
 * Allocation may have failed due to fragmentation.
 * Try a lower order alloc if possible
 */
page = alloc_slab_page(s, alloc_gfp, node, oo);
//still nothing: give up via out
if (unlikely(!page))
goto out;
//the s->min fallback succeeded: account ORDER_FALLBACK
stat(s, ORDER_FALLBACK);
}
//record the total number of objects this slab holds
page->objects = oo_objects(oo);
order = compound_order(page);
page->slab_cache = s;
__SetPageSlab(page);
if (page_is_pfmemalloc(page))
SetPageSlabPfmemalloc(page);
//virtual address of the slab's first byte
start = page_address(page);
//with SLAB_POISON debugging, fill the slab with the POISON_INUSE pattern;
//then let KASAN poison the whole slab as unallocated
if (unlikely(s->flags & SLAB_POISON))
memset(start, POISON_INUSE, PAGE_SIZE << order);
kasan_poison_slab(page);
//with CONFIG_SLAB_FREELIST_RANDOM, shuffle_freelist() builds page->freelist
//in random object order (hardening against heap-layout prediction) and
//returns true; otherwise it returns false
shuffle = shuffle_freelist(s, page);
//no shuffling: build the freelist sequentially, each object pointing to
//the next one in memory
if (!shuffle) {
for_each_object_idx(p, idx, s, start, page->objects) {
setup_object(s, page, p);
if (likely(idx < page->objects))
set_freepointer(s, p, p + s->size);
else
set_freepointer(s, p, NULL);//the last object's next pointer is NULL
}
page->freelist = fixup_red_left(s, start);
}
//a freshly built slab reports all objects in use (page->freelist carries
//them instead) and is frozen, since it will be attached to c->page
page->inuse = page->objects;
page->frozen = 1;
out:
if (gfpflags_allow_blocking(flags))
local_irq_disable();
if (!page)
return NULL;
mod_lruvec_page_state(page,
(s->flags & SLAB_RECLAIM_ACCOUNT) ?
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1 << oo_order(oo));
inc_slabs_node(s, page_to_nid(page), page->objects);
//hand the initialized slab page back
return page;
}
//2 obtain pages from the buddy system and initialize them as a slab
static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
//reject flags that make no sense for slab allocation: warn, dump the stack,
//strip the offending bits and continue with the sanitized flags
if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
flags &= ~GFP_SLAB_BUG_MASK;
pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
invalid_mask, &invalid_mask, flags, &flags);
dump_stack();
}
//2.1 the normal/default path: the core worker allocate_slab, restricted to
//the reclaim and constraint gfp bits
return allocate_slab(s,
flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
}
mm/slub.c
//core worker for slowpath-3/slowpath-4: refill from the node partial lists,
//or build a brand-new slab from the buddy system
static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
int node, struct kmem_cache_cpu **pc)
{
void *freelist;
struct kmem_cache_cpu *c = *pc;
struct page *page;
WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
//1. slowpath-3: migrate slabs from the node-managed partial lists into the
//cpu partial list, setting c->page and returning the new freelist
freelist = get_partial(s, flags, node, c);
//success from a node partial list: done
if (freelist)
return freelist;
//2. slowpath-4: otherwise create a new slab via new_slab(), which gets pages
//from the buddy system and initializes them
page = new_slab(s, flags, node);
//buddy system delivered a page
if (page) {
//re-fetch this CPU's kmem_cache_cpu
c = raw_cpu_ptr(s->cpu_slab);
if (c->page)
//c->page may have become non-NULL while interrupts were enabled inside
//new_slab() (blocking gfp flags), in two ways:
//1. another kernel path ran __slab_alloc() and attached a slab to this CPU
//2. the thread migrated and is now running on a different CPU
//flush_slab() detaches that slab (back to its node partial list) and
//clears c->page/c->freelist
flush_slab(s, c);
/*
 * No other reference to the page yet so we can
 * muck around with it freely without cmpxchg
 */
//take over the new page's freelist and attach the page to this CPU
freelist = page->freelist;
page->freelist = NULL;
//slowpath-4: account ALLOC_SLAB
stat(s, ALLOC_SLAB);
c->page = page;
*pc = c;
} else
//the buddy system failed too: report failure with a NULL freelist
freelist = NULL;
return freelist;
}