An introduction to the 4 main interface functions of SLUB memory management (2)

        The four main interface functions of the slub memory manager are listed below (based on the kernel-4.19 source):

//create a slab cache
struct kmem_cache *kmem_cache_create(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *));
//allocate a slab object
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags);
//free a slab object
void kmem_cache_free(struct kmem_cache *cachep, void *objp);
//destroy a slab cache
void kmem_cache_destroy(struct kmem_cache *);

This post focuses on kmem_cache_alloc, the function that allocates a slab object.
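Before diving in, here is a minimal usage sketch of the four interfaces above; the cache name "foo_cache" and struct foo are made up for illustration and are not part of the kernel source discussed below:

#include <linux/slab.h>

struct foo {
	int a;
	long b;
};

static struct kmem_cache *foo_cachep;

static int __init foo_example_init(void)
{
	struct foo *obj;

	/* create a cache of struct foo objects, default alignment, no constructor */
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				       0, SLAB_HWCACHE_ALIGN, NULL);
	if (!foo_cachep)
		return -ENOMEM;

	/* allocate one object -- this is the path analysed in this post */
	obj = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
	if (obj)
		kmem_cache_free(foo_cachep, obj);	/* give the object back */

	kmem_cache_destroy(foo_cachep);			/* destroy the whole cache */
	return 0;
}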

I. Function call graph

1. Call graph of kmem_cache_alloc

2. How objects are allocated on the two paths (fastpath and slowpath)

2.1 fastpath 

        If c->freelist is not NULL, the fastpath is taken: a free object is handed out directly from c->freelist and c->freelist is advanced to the next free object; statistics: ALLOC_FASTPATH++. If c->freelist is NULL, allocation falls through to the slowpath.

2.2 slowpath-1

        If c->freelist is NULL, the slowpath first checks c->page->freelist. If it is not NULL, slowpath-1 is taken: the free objects are pulled off page->freelist, one object is handed out, and c->freelist is updated via get_freepointer. Code path: get_freelist->get_freepointer. Statistics: ALLOC_REFILL++, ALLOC_SLOWPATH++.

2.3 slowpath-2

        If c->page is NULL, slowpath-1 cannot be taken. If c->partial is not empty, slowpath-2 is taken: the first page on the c->partial list (and the slab it represents) is moved to c->page, and then slowpath-1 is executed again. Code path: slub_percpu_partial->get_freelist->get_freepointer. Statistics: CPU_PARTIAL_ALLOC++, ALLOC_REFILL++, ALLOC_SLOWPATH++.

2.4 slowpath-3

        If c->freelist, page->freelist and c->partial are all NULL, free objects are looked for on the s->node->partial list. If some are found, slowpath-3 is taken: part of the slabs managed on the node partial list are migrated to c->partial, and c->page and c->freelist are updated. While migrating, the code checks whether the number of free objects moved so far exceeds half of s->cpu_partial; if so, no more slabs are moved to c->partial. Code path: new_slab_objects->get_partial->get_partial_node(get_any_partial)->get_freepointer. Statistics: ALLOC_FROM_PARTIAL++, CPU_PARTIAL_NODE++, ALLOC_SLOWPATH++.

2.5 slowpath-4

        If no free object can be obtained from the node partial list either, a new page has to be allocated from the buddy system. Based on alloc_gfp and s->oo, the struct page is initialized and attached directly to c->page, and c->freelist is updated. Code path: new_slab_objects->new_slab->allocate_slab->alloc_slab_page->get_freepointer. Statistics: ALLOC_SLAB++, ALLOC_SLOWPATH++.
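        The counters mentioned in the five cases above are only compiled in with CONFIG_SLUB_STATS; they are items of enum stat_item in include/linux/slub_def.h. An abridged list of the allocation-related items follows (the descriptions are mine, not the kernel's comments):

enum stat_item {
	ALLOC_FASTPATH,		/* object taken straight from c->freelist */
	ALLOC_SLOWPATH,		/* __slab_alloc had to be entered */
	ALLOC_REFILL,		/* c->freelist refilled from the cpu slab's page->freelist */
	ALLOC_FROM_PARTIAL,	/* cpu slab taken from the node partial list */
	ALLOC_SLAB,		/* a brand-new slab was allocated from the buddy system */
	ALLOC_NODE_MISMATCH,	/* cpu slab belonged to the wrong node and was dropped */
	CPU_PARTIAL_ALLOC,	/* cpu slab taken from the cpu partial list */
	CPU_PARTIAL_NODE,	/* cpu partial list refilled from the node partial list */
	CPU_PARTIAL_DRAIN,	/* cpu partial list drained to the node partial list */
	DEACTIVATE_BYPASS,	/* cpu slab had no free objects and was dropped implicitly */
	ORDER_FALLBACK,		/* s->oo order failed, fell back to s->min */
	/* ... further items omitted ... */
	NR_SLUB_STAT_ITEMS,
};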

        kmem_cache_alloc mainly calls slab_alloc and trace_kmem_cache_alloc. The core is slab_alloc, which simply calls slab_alloc_node to do the actual slab object allocation; trace_kmem_cache_alloc is an ftrace tracepoint (the kmem:kmem_cache_alloc event) that records information about the allocation for debugging. The rest of this post focuses on slab_alloc_node.

include/linux/kernel.h
//A gcc built-in: __builtin_return_address(level) returns a return address from the call stack.
//level selects how many frames up to look: __builtin_return_address(0) is the return address of the current function (i.e. the call site in the caller), __builtin_return_address(1) is one frame further up.
#define _RET_IP_		(unsigned long)__builtin_return_address(0)

mm/slub.c
static __always_inline void *slab_alloc(struct kmem_cache *s,
		gfp_t gfpflags, unsigned long addr)
{
	return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
}

void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
//core function
	void *ret = slab_alloc(s, gfpflags, _RET_IP_);
//ftrace tracepoint, used for debugging; records information about the slab allocation
	trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
				s->size, gfpflags);

	return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc);

II. The slab_alloc_node function

/*
 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
 * have the fastpath folded into their functions. So no function call
 * overhead for requests that can be satisfied on the fastpath.
 *
 * The fastpath works by first checking if the lockless freelist can be used.
 * If not then __slab_alloc is called for slow processing.
 *
 * Otherwise we can simply pick the next object from the lockless free list.
 */
static __always_inline void *slab_alloc_node(struct kmem_cache *s,
		gfp_t gfpflags, int node, unsigned long addr)
{
	void *object;
	struct kmem_cache_cpu *c;
	struct page *page;
	unsigned long tid;

	s = slab_pre_alloc_hook(s, gfpflags);//pre-processing of the slab cache before allocating an object
	if (!s)
		return NULL;
redo:
	/*
	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
	 * enabled. We may switch back and forth between cpus while
	 * reading from one cpu area. That does not matter as long
	 * as we end up on the original cpu again when doing the cmpxchg.
	 *
	 * We should guarantee that tid and kmem_cache are retrieved on
	 * the same cpu. It could be different if CONFIG_PREEMPT so we need
	 * to check if it is matched or not.
	 */
//make sure tid and the kmem_cache_cpu pointer come from the same CPU; with preemption enabled, loop until tid matches the current CPU's tid
	do {
		tid = this_cpu_read(s->cpu_slab->tid);
		c = raw_cpu_ptr(s->cpu_slab);
	} while (IS_ENABLED(CONFIG_PREEMPT) &&
		 unlikely(tid != READ_ONCE(c->tid)));

	/*
	 * Irqless object alloc/free algorithm used here depends on sequence
	 * of fetching cpu_slab's data. tid should be fetched before anything
	 * on c to guarantee that object and page associated with previous tid
	 * won't be used with current tid. If we fetch tid first, object and
	 * page could be one associated with next tid and our alloc/free
	 * request will be failed. In this case, we will retry. So, no problem.
	 */
//compiler barrier to prevent reordering, making sure the object and page read below belong to this CPU's data rather than another CPU's
	barrier();

	/*
	 * The transaction ids are globally unique per cpu and per operation on
	 * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
	 * occurs on the right processor and that there was no operation on the
	 * linked list in between.
	 */

	object = c->freelist;//per-cpu freelist; points to the first free object
	page = c->page;//the slab page this CPU is currently using
//if the per-cpu freelist is empty, or the current slab page does not belong to the requested node, a new cpu slab is needed
//(with the default NUMA_NO_NODE passed in by the caller, node_match always succeeds); this is where fastpath vs. slowpath is decided
	if (unlikely(!object || !node_match(page, node))) {
//slowpath allocation; the core function is __slab_alloc
		object = __slab_alloc(s, gfpflags, node, addr, c);
//update the slowpath statistic: ALLOC_SLOWPATH++
		stat(s, ALLOC_SLOWPATH);
	} else {
//fastpath allocation
//fetch the address of the next free object (stored at object + s->offset): with an inline free pointer, a free object's
//first bytes hold the pointer to the next free object, so it must be read out now, before the object is handed to the
//caller and its contents get overwritten
		void *next_object = get_freepointer_safe(s, object);

		/*
		 * The cmpxchg will only match if there was no additional
		 * operation and if we are on the right processor.
		 *
		 * The cmpxchg does the following atomically (without lock
		 * semantics!)
		 * 1. Relocate first pointer to the current per cpu area.
		 * 2. Verify that tid and freelist have not been changed
		 * 3. If they were not changed replace tid and freelist
		 *
		 * Since this is without lock semantics the protection is only
		 * against code executing on this cpu *not* from access by
		 * other cpus.
		 */
/*An atomic operation that does three things:
      (1) relocate the first pointer to the current per-cpu area;
      (2) verify that tid and freelist have not been changed;
      (3) if they were not changed (i.e. this allocation was not migrated to another CPU),
          overwrite the old values with the new ones:
            s->cpu_slab->freelist = next_object
            s->cpu_slab->tid = next_tid(tid)
          so c->freelist now points to the new next free object
*/
		if (unlikely(!this_cpu_cmpxchg_double(
				s->cpu_slab->freelist, s->cpu_slab->tid,
				object, tid,
				next_object, next_tid(tid)))) {
//the cmpxchg failed; note_cmpxchg_failure() logs the failure and we jump back to the redo label to try again
			note_cmpxchg_failure("slab_alloc", s, tid);
			goto redo;
		}
//prefetch the next object's free pointer (at next_object + s->offset) into the CPU cache to improve the hit rate of the next allocation
		prefetch_freepointer(s, next_object);
		stat(s, ALLOC_FASTPATH);
	}
//if init-on-free is enabled, wipe the free-pointer slot inside the just-allocated object (at s->offset, usually the first 8 bytes with an inline free pointer) so the stale pointer is not leaked
	maybe_wipe_obj_freeptr(s, object);
//if __GFP_ZERO was requested (or init-on-alloc is enabled), zero the whole object
	if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
		memset(object, 0, s->object_size);
//post-allocation hooks for the object, involving kmemleak and kasan
	slab_post_alloc_hook(s, gfpflags, 1, &object);

	return object;
}
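The "free pointer stored inside the object" layout that get_freepointer_safe/get_freepointer rely on can be illustrated with a small stand-alone user-space model. This is purely illustrative: the names toy_cache, toy_alloc etc. are made up, and the kernel's real implementation differs in many details (per-cpu handling, hardening, and so on).

#include <stdio.h>
#include <string.h>

/* Toy model of a SLUB-style freelist: the pointer to the next free object
 * lives inside the free object itself, at byte offset `offset` (s->offset). */
struct toy_cache {
	size_t obj_size;	/* size of one object */
	size_t offset;		/* where the free pointer is stored */
	void *freelist;		/* first free object (c->freelist) */
};

static void *toy_get_freepointer(struct toy_cache *s, void *object)
{
	void *next;

	memcpy(&next, (char *)object + s->offset, sizeof(next));
	return next;
}

static void toy_set_freepointer(struct toy_cache *s, void *object, void *next)
{
	memcpy((char *)object + s->offset, &next, sizeof(next));
}

static void *toy_alloc(struct toy_cache *s)
{
	void *object = s->freelist;

	if (object)	/* "fastpath": pop the first free object */
		s->freelist = toy_get_freepointer(s, object);
	return object;
}

int main(void)
{
	static char slab[4 * 64];	/* one "slab" holding 4 objects of 64 bytes */
	struct toy_cache s = { .obj_size = 64, .offset = 0, .freelist = slab };
	int i;

	/* build the initial freelist: each free object points to the next one */
	for (i = 0; i < 4; i++)
		toy_set_freepointer(&s, slab + i * s.obj_size,
				    i < 3 ? slab + (i + 1) * s.obj_size : NULL);

	for (i = 0; i < 5; i++)	/* the 5th allocation fails: freelist exhausted */
		printf("alloc %d -> %p\n", i, toy_alloc(&s));
	return 0;
}

In the kernel, set_freepointer/get_freepointer in mm/slub.c play the roles of the toy helpers above, with s->offset deciding where inside the object the free pointer lives.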

2.1 The slab_pre_alloc_hook function

include/linux/gfp.h
/*
 * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
 * GFP flags are used before interrupts are enabled. Once interrupts are
 * enabled, it is set to __GFP_BITS_MASK while the system is running. During
 * hibernation, it is used by PM to avoid I/O during memory allocation while
 * devices are suspended.
 */
//during early boot (before interrupts are enabled) gfp_allowed_mask is GFP_BOOT_MASK; once interrupts are up it is set to __GFP_BITS_MASK, which is the case here
extern gfp_t gfp_allowed_mask;



mm/slab.h
static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
						     gfp_t flags)
{
//GFP flags are defined in include/linux/gfp.h; GFP_KERNEL is 0x6000c0
	flags &= gfp_allowed_mask;
//the next two calls only do anything when lockdep is enabled
	fs_reclaim_acquire(flags);
	fs_reclaim_release(flags);
//the three steps above are flag checks

//allocating a slab object may sleep; GFP_KERNEL, for example, allows blocking, so the CPU can be yielded to other threads.
//e.g. when the allocation has to go back to the buddy system for a new page and build a new slab before handing out the
//object, it can take a while, so the CPU is given up and the thread is woken up once the object is available
	might_sleep_if(gfpflags_allow_blocking(flags));
//should_failslab is a fault-injection hook and normally returns 0
	if (should_failslab(s, flags))
		return NULL;

//if memcg kmem accounting is enabled and either the gfp flags or s->flags request accounting,
//redirect to the kmem_cache that belongs to the current memcg
	if (memcg_kmem_enabled() &&
	    ((flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT)))
		return memcg_kmem_get_cache(s);
//otherwise return s unchanged
	return s;
}

2.2 The core functions of slowpath object allocation: __slab_alloc and ___slab_alloc

 The __slab_alloc function

mm/slub.c
/*
 * Another one that disabled interrupt and compensates for possible
 * cpu changes by refetching the per cpu area pointer.
 */
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c)
{
	void *p;
	unsigned long flags;
//disable interrupts on the local processor and save their previous state in flags
	local_irq_save(flags);
#ifdef CONFIG_PREEMPT
	/*
	 * We may have been preempted and rescheduled on a different
	 * cpu before disabling interrupts. Need to reload cpu area
	 * pointer.
	 */
//with preemption enabled we may have been rescheduled onto a different CPU, so re-fetch the per-cpu pointer with this_cpu_ptr()
	c = this_cpu_ptr(s->cpu_slab);
#endif
//core function
	p = ___slab_alloc(s, gfpflags, node, addr, c);
//restore the local interrupt state saved in flags
	local_irq_restore(flags);
	return p;
}

Next, the ___slab_alloc function:

mm/slub.c
Now for the real star of the show...
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c)
{
	void *freelist;
	struct page *page;
//the page pointer in the per-cpu kmem_cache_cpu, pointing to the slab page currently in use
	page = c->page;

//if c->page is NULL, i.e. there is no cpu-local slab yet, jump to the new_slab label to get one
	if (!page) {
		/*
		 * if the node is not online or has no normal memory, just
		 * ignore the node constraint
		 */
//check the node constraint; with the default NUMA_NO_NODE passed in by the caller this is a no-op
		if (unlikely(node != NUMA_NO_NODE &&
			     !node_state(node, N_NORMAL_MEMORY)))
			node = NUMA_NO_NODE;
		goto new_slab;
	}
redo:
//c->page is not NULL here; since node is NUMA_NO_NODE (the default), node_match returns 1 and this branch is not taken
	if (unlikely(!node_match(page, node))) {
		/*
		 * same as above but node_match() being false already
		 * implies node != NUMA_NO_NODE
		 */
//node_match returning 0 implies node != NUMA_NO_NODE; if that node has no normal memory, ignore the constraint (set NUMA_NO_NODE) and retry
		if (!node_state(node, N_NORMAL_MEMORY)) {
			node = NUMA_NO_NODE;
			goto redo;
		} else {
			stat(s, ALLOC_NODE_MISMATCH);
//the page does not belong to the requested node, so deactivate the cpu-local slab via deactivate_slab()
//(the slab is detached from the CPU and put back on its node's partial list), then get a new slab via new_slab
			deactivate_slab(s, page, c->freelist, c);
			goto new_slab;
		}
	}

	/*
	 * By rights, we should be searching for a slab page that was
	 * PFMEMALLOC but right now, we are losing the pfmemalloc
	 * information when the page leaves the per-cpu allocator
	 */
//if the page was taken from pfmemalloc reserves but the current gfp flags do not allow using such reserves (e.g. __GFP_NOMEMALLOC), deactivate this slab and get a new one via new_slab
	if (unlikely(!pfmemalloc_match(page, gfpflags))) {
		deactivate_slab(s, page, c->freelist, c);
		goto new_slab;
	}

	/* must check again c->freelist in case of cpu migration or IRQ */
/*
check c->freelist again: a CPU migration or interrupt may have happened before local interrupts were disabled,
so the local freelist may no longer be empty. On the first pass it is NULL (that is why we entered __slab_alloc),
but after the various slowpath jumps back to redo it may well be non-NULL.
*/
	freelist = c->freelist;
	if (freelist)
//if it is not NULL, jump to load_freelist and hand out a free object
		goto load_freelist;
//slowpath-1: freelist is NULL, so grab the whole freelist from the page (page->freelist is set to NULL in the process)
	freelist = get_freelist(s, page);
//if the page had no free objects either, drop it (c->page = NULL), bump DEACTIVATE_BYPASS and go get a new slab
	if (!freelist) {
		c->page = NULL;
		stat(s, DEACTIVATE_BYPASS);
		goto new_slab;
	}
//otherwise bump the ALLOC_REFILL statistic and fall through to load_freelist (slowpath-1)
	stat(s, ALLOC_REFILL);

load_freelist:
	/*
	 * freelist is pointing to the list of objects to be used.
	 * page is pointing to the page from which the objects are obtained.
	 * That page must be frozen for per cpu allocations to work.
	 */
//frozen is normally 1 here (the page is the cpu slab or came from the cpu partial list); a page on the node partial list has frozen == 0, but when it is taken from the node partial list it is detached from that list, attached to c->page, and frozen is set to 1
	VM_BUG_ON(!c->page->frozen);
//advance the per-cpu freelist to the next free object
	c->freelist = get_freepointer(s, freelist);
//update tid and return the allocated object
	c->tid = next_tid(c->tid);
	return freelist;

new_slab:
//slowpath-2: allocation from c->page failed (slowpath-1), but c->partial is not empty, so take the first page (and its slab) off the cpu partial list
	if (slub_percpu_partial(c)) {
		page = c->page = slub_percpu_partial(c);
		slub_set_percpu_partial(c, page);//c->partial = page->next
//bump the CPU_PARTIAL_ALLOC statistic and go back to redo
		stat(s, CPU_PARTIAL_ALLOC);
		goto redo;
	}
//slowpath-3 and slowpath-4: the cpu partial list is empty too, so all cpu-partial slabs are fully in use; allocate new slab objects via new_slab_objects(), which either pulls slabs from the node partial list or gets a fresh slab from the buddy system, then updates c->freelist
	freelist = new_slab_objects(s, gfpflags, node, &c);

	if (unlikely(!freelist)) {
//neither the node partial list nor the buddy system could satisfy the request; slab_out_of_memory() logs the failure (visible in the kernel log) and NULL is returned
		slab_out_of_memory(s, gfpflags, node);
		return NULL;
	}

	page = c->page;
//allocation succeeded; if slub debugging is off and the page's pfmemalloc state matches the gfp flags, jump to load_freelist,
//otherwise fall through
	if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
		goto load_freelist;

//with slub debug enabled, kmem_cache_debug returns 1; if the debug processing of this allocation fails, go back to new_slab and try another slab
	/* Only entered in the debug case */
	if (kmem_cache_debug(s) &&
			!alloc_debug_processing(s, page, freelist, addr))
		goto new_slab;	/* Slab failed checks. Next slab needed */
//deactivate the cpu-local slab (debug/pfmemalloc-mismatch case: the slab is not kept as the cpu slab)
	deactivate_slab(s, page, get_freepointer(s, freelist), c);
//finally return the object allocated on the slowpath
	return freelist;
}
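get_freelist is referenced above but not listed in this post. For reference, in mm/slub.c (kernel 4.19) it looks roughly like the following: it atomically takes the whole page->freelist for the CPU, sets page->freelist to NULL, and leaves the page frozen only if it still had free objects (otherwise the caller clears c->page, the DEACTIVATE_BYPASS case).

static inline void *get_freelist(struct kmem_cache *s, struct page *page)
{
	struct page new;
	unsigned long counters;
	void *freelist;

	do {
		freelist = page->freelist;
		counters = page->counters;

		new.counters = counters;
		VM_BUG_ON(!new.frozen);

		new.inuse = page->objects;
		new.frozen = freelist != NULL;

	} while (!__cmpxchg_double_slab(s, page,
		freelist, counters,
		NULL, new.counters,
		"get_freelist"));

	return freelist;
}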

2.2.1 The pfmemalloc_match(page, gfpflags) function

mm/internal.h
/* The ALLOC_WMARK bits are used as an index to zone->watermark */
#define ALLOC_WMARK_MIN		WMARK_MIN
#define ALLOC_WMARK_LOW		WMARK_LOW
#define ALLOC_WMARK_HIGH	WMARK_HIGH
#define ALLOC_NO_WATERMARKS	0x04 /* don't check watermarks at all */

/* Mask to get the watermark bits */
#define ALLOC_WMARK_MASK	(ALLOC_NO_WATERMARKS-1)

/*
 * Only MMU archs have async oom victim reclaim - aka oom_reaper so we
 * cannot assume a reduced access to memory reserves is sufficient for
 * !MMU
 */
#ifdef CONFIG_MMU
#define ALLOC_OOM		0x08
#else
#define ALLOC_OOM		ALLOC_NO_WATERMARKS
#endif

#define ALLOC_HARDER		0x10 /* try to alloc harder */
#define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET		0x40 /* check for correct cpuset */
#define ALLOC_CMA		0x80 /* allow allocations from CMA areas */


mm/page_alloc.c
static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
{
//if __GFP_NOMEMALLOC is set, return 0; pfmemalloc_match then fails, deactivate_slab runs and a new slab is allocated
//the other cases return the non-zero ALLOC_* values defined above, or 0 if none of them apply
	if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
		return 0;
	if (gfp_mask & __GFP_MEMALLOC)
		return ALLOC_NO_WATERMARKS;
	if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
		return ALLOC_NO_WATERMARKS;
	if (!in_interrupt()) {
		if (current->flags & PF_MEMALLOC)
			return ALLOC_NO_WATERMARKS;
		else if (oom_reserves_allowed(current))
			return ALLOC_OOM;
	}

	return 0;
}

bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
{
	return !!__gfp_pfmemalloc_flags(gfp_mask);
}

mm/slub.c
static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
{
//the page was allocated from pfmemalloc reserves (SlabPfmemalloc set): whether it may be used depends on the caller's gfp mask
	if (unlikely(PageSlabPfmemalloc(page)))
		return gfp_pfmemalloc_allowed(gfpflags);
//a page that is not marked SlabPfmemalloc can always be used, so return true
	return true;
}

2.2.2 The new_slab_objects(s, gfpflags, node, &c) function

mm/slub.c
/*1.1.1
 * Put a page that was just frozen (in __slab_free) into a partial page
 * slot if available.
 *
 * If we did not find a slot then simply move all the partials to the
 * per node partial list.
 */
static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
	struct page *oldpage;
	int pages;
	int pobjects;
//preemption must be disabled for this
	preempt_disable();
	do {
		pages = 0;
		pobjects = 0;
//oldpage is the first page on the cpu partial list; unlike the pages behind it, it also carries pobjects (free objects currently on the cpu partial list) and pages (number of slabs on the list)
		oldpage = this_cpu_read(s->cpu_slab->partial);

		if (oldpage) {
			pobjects = oldpage->pobjects;
			pages = oldpage->pages;
//on the alloc path drain is 0, so this branch is normally not taken; on the free path (drain != 0), if the number of free
//objects exceeds s->cpu_partial the cpu partial list is full and all of its pages are moved to the node partial list
			if (drain && pobjects > s->cpu_partial) {
				unsigned long flags;
				/*
				 * partial array is full. Move the existing
				 * set to the per node partial list.
				 */
				local_irq_save(flags);
//unfreeze_partials moves every page on the cpu partial list to the tail of its node's partial list and clears frozen;
//if n->nr_partial >= s->min_partial (and the slab is empty), the page is put on the discard list instead and freed afterwards
				unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
				local_irq_restore(flags);
				oldpage = NULL;
				pobjects = 0;
				pages = 0;
//bump the CPU_PARTIAL_DRAIN statistic
				stat(s, CPU_PARTIAL_DRAIN);
			}
		}
//on the alloc path we get here: update pages and pobjects
		pages++;
		pobjects += page->objects - page->inuse;
//insert the new page at the head of the cpu partial list
		page->pages = pages;
		page->pobjects = pobjects;
		page->next = oldpage;
//if this_cpu_cmpxchg succeeds (the partial head was still oldpage and has been replaced with page), it returns oldpage and the loop exits
	} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
								!= oldpage);
//if s->cpu_partial is 0 this cache is not supposed to keep per-cpu partial slabs at all, so whatever was just queued is
//immediately detached from the cpu partial list and its pages are hung on the corresponding node partial list
//(this is covered in the earlier article on how the slub allocator works)
	if (unlikely(!s->cpu_partial)) {
		unsigned long flags;

		local_irq_save(flags);
		unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
		local_irq_restore(flags);
	}
	preempt_enable();
#endif
}
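put_cpu_partial (and the rest of the allocator) relies on the slab-related fields of struct page. The sketch below is a simplified view of those fields; the real definition in include/linux/mm_types.h (kernel 4.19) packs them into several overlapping unions, so treat this as an orientation aid rather than the exact layout:

struct page {
	/* ... (only the SLUB-related fields are shown, simplified) ... */

	struct page *next;		/* next page on the cpu partial list (c->partial) */
	int pages;			/* head page of c->partial only: nr of slabs on the list */
	int pobjects;			/* head page of c->partial only: approx. free objects on the list */

	struct kmem_cache *slab_cache;	/* the cache this slab belongs to */
	void *freelist;			/* first free object in this slab */
	union {
		unsigned long counters;
		struct {
			unsigned inuse:16;	/* objects in use (all of them while frozen) */
			unsigned objects:15;	/* total objects in this slab */
			unsigned frozen:1;	/* 1: the slab is owned by a CPU */
		};
	};

	/* on the node partial list the page is linked through page->lru instead of page->next */
};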

/*1.1
 * Try to allocate a partial slab from a specific node.
 */
static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
				struct kmem_cache_cpu *c, gfp_t flags)
{
	struct page *page, *page2;
	void *object = NULL;
	unsigned int available = 0;
	int objects;

	/*
	 * Racy check. If we mistakenly see no partial slabs then we
	 * just allocate an empty slab. If we mistakenly try to get a
	 * partial slab and there is none available then get_partials()
	 * will return NULL.
	 */
//if n is NULL or its partial list is empty, return NULL
	if (!n || !n->nr_partial)
		return NULL;

	spin_lock(&n->list_lock);

//walk the node's partial list, iterating over its pages with the page variable
	list_for_each_entry_safe(page, page2, &n->partial, lru) {
		void *t;

//skip pages whose pfmemalloc state does not match the requested gfp flags
		if (!pfmemalloc_match(page, flags))
			continue;

//acquire_slab returns a pointer to the page's free objects (or NULL); it removes the page from the node partial list, updates n->nr_partial, flips the page's frozen flag from 0 to 1, and stores the number of free objects of this slab in objects
		t = acquire_slab(s, n, page, object == NULL, &objects);
//acquire_slab returns NULL if it lost the race to take this page (its cmpxchg failed); stop in that case
		if (!t)
			break;

		available += objects;//running total of free objects in the slabs taken so far

//on the first iteration object is still NULL, so the if branch is taken; afterwards object holds t and the else branch is taken
		if (!object) {
//the first acquired page becomes the cpu slab (c->page); bump the ALLOC_FROM_PARTIAL statistic
			c->page = page;
			stat(s, ALLOC_FROM_PARTIAL);
			object = t;//remember this freelist; it will be returned to the caller
		} else {

//1.1.1 subsequent pages go onto the cpu partial list; bump the CPU_PARTIAL_NODE statistic
			put_cpu_partial(s, page, 0);
			stat(s, CPU_PARTIAL_NODE);
		}

//if CONFIG_SLUB_CPU_PARTIAL is disabled (kmem_cache_has_cpu_partial returns 0), the negation is true and we break right after the first slab
//if it is enabled (the default), keep pulling slabs until the free objects gathered exceed half of s->cpu_partial, then break
		if (!kmem_cache_has_cpu_partial(s)
			|| available > slub_cpu_partial(s) / 2)
			break;

	}
	spin_unlock(&n->list_lock);
//return the freelist of the first slab acquired; this is the freelist ___slab_alloc will use
	return object;
}
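acquire_slab is used above but not listed either. For reference, in mm/slub.c (kernel 4.19) it looks roughly like this: under n->list_lock it freezes the page with a cmpxchg and removes it from the node partial list; for the first page (mode != 0) it also takes the freelist away from the page, while for later pages the freelist stays on the page:

static inline void *acquire_slab(struct kmem_cache *s,
		struct kmem_cache_node *n, struct page *page,
		int mode, int *objects)
{
	void *freelist;
	unsigned long counters;
	struct page new;

	lockdep_assert_held(&n->list_lock);

	/*
	 * Zap the freelist and set the frozen bit.
	 * The old freelist is the list of objects for the
	 * per cpu allocation list.
	 */
	freelist = page->freelist;
	counters = page->counters;
	new.counters = counters;
	*objects = new.objects - new.inuse;
	if (mode) {
		new.inuse = page->objects;
		new.freelist = NULL;
	} else {
		new.freelist = freelist;
	}

	VM_BUG_ON(new.frozen);
	new.frozen = 1;

	if (!__cmpxchg_double_slab(s, page,
			freelist, counters,
			new.freelist, new.counters,
			"acquire_slab"))
		return NULL;

	remove_partial(n, page);
	WARN_ON(!freelist);
	return freelist;
}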


/*1
 * Get a partial page, lock it and return it.
 */
static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
		struct kmem_cache_cpu *c)
{
	void *object;
	int searchnode = node;

	if (node == NUMA_NO_NODE)
//use the local memory node of this CPU (numa_mem_id() returns its node index)
		searchnode = numa_mem_id();
	else if (!node_present_pages(node))
		searchnode = node_to_mem_node(node);

//get_node(s, searchnode) returns s->node[searchnode]
//1.1 get_partial_node migrates some slabs from that node's partial list to the cpu partial list, updates c->page, and returns the new freelist
	object = get_partial_node(s, get_node(s, searchnode), c, flags);

//if allocation from the node partial list succeeded, object is non-NULL and is returned; with the caller's NUMA_NO_NODE the second condition is false
	if (object || node != NUMA_NO_NODE)
		return object;

//try the other nodes in s->node: this also migrates some slabs to the cpu partial list, updates c->freelist and c->page, and returns the freelist
//whether it does anything depends on two things: it only proceeds on NUMA systems (on non-NUMA it returns NULL immediately),
//and on s->remote_node_defrag_ratio being large enough; the smaller that value, the stronger the preference for the local node (searchnode)
	return get_any_partial(s, flags, c);
}



//2.1 get a page from the buddy system and initialize its slab-related fields
static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
	struct page *page;
	struct kmem_cache_order_objects oo = s->oo;//default order/objects for this cache
	gfp_t alloc_gfp;
	void *start, *p;
	int idx, order;
	bool shuffle;

	flags &= gfp_allowed_mask;

	if (gfpflags_allow_blocking(flags))
		local_irq_enable();

	flags |= s->allocflags;

	/*
	 * Let the initial higher-order allocation fail under memory pressure
	 * so we fall-back to the minimum order allocation.
	 */
	alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
	if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
		alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
//allocate pages of the order encoded in oo, using alloc_gfp
	page = alloc_slab_page(s, alloc_gfp, node, oo);
	if (unlikely(!page)) {
//if a slab of order s->oo cannot be allocated, fall back to s->min (the minimum order that still fits at least one object) and retry
		oo = s->min;
		alloc_gfp = flags;
		/*
		 * Allocation may have failed due to fragmentation.
		 * Try a lower order alloc if possible
		 */
		page = alloc_slab_page(s, alloc_gfp, node, oo);
//if that fails as well, give up and jump to out
		if (unlikely(!page))
			goto out;
//otherwise the fallback order worked; bump the ORDER_FALLBACK statistic
		stat(s, ORDER_FALLBACK);
	}

//record the total number of objects this slab holds in page->objects
	page->objects = oo_objects(oo);

	order = compound_order(page);
	page->slab_cache = s;
	__SetPageSlab(page);
	if (page_is_pfmemalloc(page))
		SetPageSlabPfmemalloc(page);
		
//virtual address of the page
	start = page_address(page);

//if SLAB_POISON is set, fill the whole slab with POISON_INUSE; kasan_poison_slab below poisons the page for KASAN
	if (unlikely(s->flags & SLAB_POISON))
		memset(start, POISON_INUSE, PAGE_SIZE << order);

	kasan_poison_slab(page);
	
//if CONFIG_SLAB_FREELIST_RANDOM is disabled shuffle_freelist simply returns false; if enabled, it builds page->freelist in a randomized order to make object addresses harder to predict
	shuffle = shuffle_freelist(s, page);
	
//if the freelist was not shuffled, build it sequentially: each object's free pointer points to the next object in the slab
	if (!shuffle) {
		for_each_object_idx(p, idx, s, start, page->objects) {
			setup_object(s, page, p);
			if (likely(idx < page->objects))
				set_freepointer(s, p, p + s->size);
			else
				set_freepointer(s, p, NULL);//the last free object's next pointer is NULL

		}
		page->freelist = fixup_red_left(s, start);
	}
//right after creation page->inuse is set to the total object count and frozen to 1, because the page is about to become the cpu slab (c->page)
	page->inuse = page->objects;
	page->frozen = 1;

out:
	if (gfpflags_allow_blocking(flags))
		local_irq_disable();
	if (!page)
		return NULL;

	mod_lruvec_page_state(page,
		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
		1 << oo_order(oo));

	inc_slabs_node(s, page_to_nid(page), page->objects);
//return the page
	return page;
}
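s->oo (and the fallback s->min) pack the slab's page order and the number of objects per slab into a single unsigned int. Roughly as defined in include/linux/slub_def.h and mm/slub.c in 4.19; for example, order 3 with 64-byte objects gives oo_order() == 3 and oo_objects() == 512:

struct kmem_cache_order_objects {
	unsigned int x;
};

#define OO_SHIFT	16
#define OO_MASK		((1 << OO_SHIFT) - 1)

static inline unsigned int oo_order(struct kmem_cache_order_objects x)
{
	return x.x >> OO_SHIFT;	/* page order of one slab */
}

static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
{
	return x.x & OO_MASK;	/* number of objects in one slab */
}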


//2 get a page from the buddy system and initialize its slab-related fields
static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
//if flags contain bits from GFP_SLAB_BUG_MASK, print a warning and dump the stack, then strip the invalid bits and continue
	if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
		gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
		flags &= ~GFP_SLAB_BUG_MASK;
		pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
				invalid_mask, &invalid_mask, flags, &flags);
		dump_stack();
	}
//2.1 the normal path: the core function allocate_slab
	return allocate_slab(s,
		flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
}

mm/slub.c
//and now the function itself...
static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
			int node, struct kmem_cache_cpu **pc)
{
	void *freelist;
	struct kmem_cache_cpu *c = *pc;
	struct page *page;

	WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
//1. slowpath-3: migrate some slabs from the node partial list to the cpu partial list, update c->page, and return the freelist
	freelist = get_partial(s, flags, node, c);

//if the node partial list satisfied the request, return right away
	if (freelist)
		return freelist;

//2. slowpath-4: otherwise create a new slab with new_slab(), which gets a page from the buddy system and initializes it
	page = new_slab(s, flags, node);

//a page was successfully obtained from the buddy system
	if (page) {
//re-fetch the per-cpu kmem_cache_cpu of s
		c = raw_cpu_ptr(s->cpu_slab);
		if (c->page)
//if c->page is not NULL, the new page from the buddy system is about to become the cpu slab, so the old c->page has to be
//detached from the CPU (flush_slab deactivates it onto the node partial list and clears c->page and c->freelist)
/*c->page can be non-NULL here because interrupts were re-enabled during new_slab() when the gfp flags allow blocking, so:
1. another kernel path may have run __slab_alloc() and attached a slab to this CPU, or
2. the thread may have migrated and is no longer running on the CPU it started on*/
			flush_slab(s, c);

		/*
		 * No other reference to the page yet so we can
		 * muck around with it freely without cmpxchg
		 */
//take over the new page's freelist and clear page->freelist
		freelist = page->freelist;
		page->freelist = NULL;
//slowpath-4: bump the ALLOC_SLAB statistic
		stat(s, ALLOC_SLAB);
		c->page = page;
		*pc = c;
	} else
//no page could be obtained from the buddy system, so return NULL
		freelist = NULL;

	return freelist;
}

