linux3.10 内存管理(四)slab机制详解1 初始化上

在讲初始化之前,先看一下slab机制中用到的主要结构体:

struct kmem_cache {
/* 1) Cache tunables. Protected by cache_chain_mutex */
	unsigned int batchcount;  从slab放入cpu array的object个数
	unsigned int limit;
	unsigned int shared;

	unsigned int size;对其后实际分配的object的size的大小
	u32 reciprocal_buffer_size;
/* 2) touched by every alloc & free from the backend */

	unsigned int flags;		/* constant flags */
	unsigned int num;	slab中object的数量	/* # of objs per slab */

/* 3) cache_grow/shrink */
	/* order of pgs per slab (2^n) */
	unsigned int gfporder;     占用的页的个数

	/* force GFP flags, e.g. GFP_DMA */
	gfp_t allocflags;

	size_t colour;			/* cache colouring range */
	unsigned int colour_off;	/* colour offset */
	struct kmem_cache *slabp_cache;
	unsigned int slab_size;  slab管理结构的大小

	/* constructor func */
	void (*ctor)(void *obj);

/* 4) cache creation/removal */
	const char *name;
	struct list_head list;  用来把多个kmem_cache 串起来
	int refcount;
	int object_size;   每个object的大小
	int align;

/* 5) statistics */
#ifdef CONFIG_DEBUG_SLAB
	unsigned long num_active;
	unsigned long num_allocations;
	unsigned long high_mark;
	unsigned long grown;
	unsigned long reaped;
	unsigned long errors;
	unsigned long max_freeable;
	unsigned long node_allocs;
	unsigned long node_frees;
	unsigned long node_overflow;
	atomic_t allochit;
	atomic_t allocmiss;
	atomic_t freehit;
	atomic_t freemiss;

	/*
	 * If debugging is enabled, then the allocator can add additional
	 * fields and/or padding to every object. size contains the total
	 * object size including these internal fields, the following two
	 * variables contain the offset to the user object and its size.
	 */
	int obj_offset;
#endif /* CONFIG_DEBUG_SLAB */
#ifdef CONFIG_MEMCG_KMEM
	struct memcg_cache_params *memcg_params;
#endif

/* 6) per-cpu/per-node data, touched during every alloc/free */
	/*
	 * We put array[] at the end of kmem_cache, because we want to size
	 * this array to nr_cpu_ids slots instead of NR_CPUS
	 * (see kmem_cache_init())
	 * We still use [NR_CPUS] and not [1] or [0] because cache_cache
	 * is statically defined, so we reserve the max number of cpus.
	 *
	 * We also need to guarantee that the list is able to accomodate a
	 * pointer for each node since "nodelists" uses the remainder of
	 * available pointers.
	 */
	struct kmem_cache_node **node;管理slab数据



	struct array_cache *array[NR_CPUS + MAX_NUMNODES];
        per_cpu数据,记录了本地高速缓存的信息,每次获取object都是通过它
	/*
	 * Do not add fields after array[]
	 */
};

每一种内存大小的类型,都用kmem_cache结构来表示,多个kmem_cache又挂载在slab_caches链表上。

struct kmem_cache_node {
	spinlock_t list_lock;

#ifdef CONFIG_SLAB
	struct list_head slabs_partial;	slab中内存部分使用以后,挂载在这个链表
	struct list_head slabs_full;     slab中内存全部使用以后,挂载在这个链表
	struct list_head slabs_free;     slab中内存没有使用则挂载在这个链表
	unsigned long free_objects;
	unsigned int free_limit;
	unsigned int colour_next;	/* Per-node cache coloring */
	struct array_cache *shared;	/* shared per node */
	struct array_cache **alien;	/* on other nodes */
	unsigned long next_reap;	/* updated without locking */
	int free_touched;		/* updated without locking */
#endif

#ifdef CONFIG_SLUB
	unsigned long nr_partial;
	struct list_head partial;
#ifdef CONFIG_SLUB_DEBUG
	atomic_long_t nr_slabs;
	atomic_long_t total_objects;
	struct list_head full;
#endif
#endif

}

kmem_cache_node是用来管理具体的slab的,根据slab的不同使用情况,挂载不同的链表上面。

struct array_cache {
	unsigned int avail;  该缓存里可提供的object数量
	unsigned int limit;
	unsigned int batchcount;  每次从slab中取object的数量
	unsigned int touched;
	spinlock_t lock;
	void *entry[];	object都放在这边
};

array_cache用来管理高速缓存,从slab系统申请内存的时候,优先从这边分配

struct slab {
	union {
		struct {
			struct list_head list;
			unsigned long colouroff;
			void *s_mem;		分配的内存的起始地址
			unsigned int inuse;	/* num of objs active in slab */
			kmem_bufctl_t free;  指向下一个空闲的object
			unsigned short nodeid;
		};
		struct slab_rcu __slab_cover_slab_rcu;
	};
};

slab是用来管理object的,每次从slab系统申请空间,需要返回object。

系统初始化的时候,会进行如下的调用关系来初始化slab系统:

start_kernel

    ---------->mm_init

         ----------->kmem_cache_init

void __init kmem_cache_init(void)
{
	int i;

	kmem_cache = &kmem_cache_boot; 把静态值赋值给kmem_cache
	setup_node_pointer(kmem_cache);初始化kmem_cache node的值为最后一个cpu cache后面的空
                                       间,这样就可以给node指针赋值了

	if (num_possible_nodes() == 1)
		use_alien_caches = 0;

	for (i = 0; i < NUM_INIT_LISTS; i++)    初始化init_kmem_cache_node
		kmem_cache_node_init(&init_kmem_cache_node[i]);

	set_up_node(kmem_cache, CACHE_CACHE); node[0]指向init_kmem_cache_node[0]

	/*
	 * Fragmentation resistance on low memory - only use bigger
	 * page orders on machines with more than 32MB of memory if
	 * not overridden on the command line.
	 */
	if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
		slab_max_order = SLAB_MAX_ORDER_HI;

	/* Bootstrap is tricky, because several objects are allocated
	 * from caches that do not exist yet:
	 * 1) initialize the kmem_cache cache: it contains the struct
	 *    kmem_cache structures of all caches, except kmem_cache itself:
	 *    kmem_cache is statically allocated.
	 *    Initially an __init data area is used for the head array and the
	 *    kmem_cache_node structures, it's replaced with a kmalloc allocated
	 *    array at the end of the bootstrap.
	 * 2) Create the first kmalloc cache.
	 *    The struct kmem_cache for the new cache is allocated normally.
	 *    An __init data area is used for the head array.
	 * 3) Create the remaining kmalloc caches, with minimally sized
	 *    head arrays.
	 * 4) Replace the __init data head arrays for kmem_cache and the first
	 *    kmalloc cache with kmalloc allocated arrays.
	 * 5) Replace the __init data for kmem_cache_node for kmem_cache and
	 *    the other cache's with kmalloc allocated memory.
	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
	 */

	/* 1) create the kmem_cache */

	/*
	 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
	 */
        为kmem_cache 创建slab,该结构用来为后续的kmem_cache 创建分配空间
	create_boot_cache(kmem_cache, "kmem_cache",
		offsetof(struct kmem_cache, array[nr_cpu_ids]) +
				  nr_node_ids * sizeof(struct kmem_cache_node *),
				  SLAB_HWCACHE_ALIGN);
        然后把kmem_cache挂载到slab_caches,所有的kmem_cache都会挂载到slab_caches
	list_add(&kmem_cache->list, &slab_caches);

	/* 2+3) create the kmalloc caches */

	/*
	 * Initialize the caches that provide memory for the array cache and the
	 * kmem_cache_node structures first.  Without this, further allocations will
	 * bug.
	 */
	创建一个名字叫kmalloc-ac的kmem_cache,并且为其创建slab,该结构用来为后续的array_cache
        分配空间
	kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac",
					kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS);

        创建一个名字叫kmalloc-node的kmem_cache,并且为其创建slab,该结构用来为后续的
        kmem_cache_node分配空间
	if (INDEX_AC != INDEX_NODE)
		kmalloc_caches[INDEX_NODE] =
			create_kmalloc_cache("kmalloc-node",
				kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);

	slab_early_init = 0;


	/* 4) Replace the bootstrap head arrays */
	{
		struct array_cache *ptr;
                向slab系统申请arraycache_init,用来取代之前使用的静态的arraycache_init
		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);

		memcpy(ptr, cpu_cache_get(kmem_cache),
		       sizeof(struct arraycache_init));
		/*
		 * Do not assume that spinlocks can be initialized via memcpy:
		 */
		spin_lock_init(&ptr->lock);

		kmem_cache->array[smp_processor_id()] = ptr;

		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);

		BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC])
		       != &initarray_generic.cache);
		memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]),
		       sizeof(struct arraycache_init));
		/*
		 * Do not assume that spinlocks can be initialized via memcpy:
		 */
		spin_lock_init(&ptr->lock);

		kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr;
	}
	/* 5) Replace the bootstrap kmem_cache_node */
	{
		int nid;
                向slab系统申请arraycache_init,用来取代之前使用的静态的kmem_cache_node
		for_each_online_node(nid) {
			init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);

			init_list(kmalloc_caches[INDEX_AC],
				  &init_kmem_cache_node[SIZE_AC + nid], nid);

			if (INDEX_AC != INDEX_NODE) {
				init_list(kmalloc_caches[INDEX_NODE],
					  &init_kmem_cache_node[SIZE_NODE + nid], nid);
			}
		}
	}
        再分配一些其他大小的kmem_cache
	create_kmalloc_caches(ARCH_KMALLOC_FLAGS);
}

slab系统依赖于伙伴系统,用于申请小内存时,把一页划分成多个object,来实现内存的更有效使用。

上面的代码,在slab系统没起来之间,只好用一些静态的结构来实现slab系统的初始化,等初始化完以后,再从slab系统分配内存,把这些结构替换掉。

上述代码的第一阶段,先为名字叫做kmem_cache和结构申请slab,这边kmem_cache使用了静态结构,里面的kmem_cache_node和array_cache都使用静态结构。

第二阶段,为名字叫kmalloc-ac的结构申请slab,这个时候kmem_cache slab已经初始化好,所以这边只使用了两个静态结构kmem_cache_node和array_cache。

第三阶段,为名字叫kmalloc-node的结构申请slab,这个时候array_cache slab也已经ok,所以只是用了kmem_cache_node静态结构。

第四第五阶段,除了第一个静态结构kmem_cache外,把其他的静态结构用新分配的slab替换掉。

下面先看一下setup_node_pointer

static void setup_node_pointer(struct kmem_cache *cachep)
{
	cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids];
}

由于一开始,node是个struct kmem_cache_node **类型,所以把node指向array[1],这边cpu cache只是用array[0],然后就可以使用node[0]这样的方式给node赋值。接着调用set_up_node,使用静态变量为cachep->node赋值,然后就可以使用cachep->node了。

然后调用create_boot_cache初始化kmem_cache的slab,slab里object的大小为kmem_cache的大小加struct kmem_cache_node *的大小。

create_boot_cache

     -------->__kmem_cache_create

int
__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
	size_t left_over, slab_size, ralign;
	gfp_t gfp;
	int err;
	size_t size = cachep->size;


	/*
	 * Check that size is in terms of words.  This is needed to avoid
	 * unaligned accesses for some archs when redzoning is used, and makes
	 * sure any on-slab bufctl's are also correctly aligned.
	 */
        对size进行字节对齐
	if (size & (BYTES_PER_WORD - 1)) {
		size += (BYTES_PER_WORD - 1);
		size &= ~(BYTES_PER_WORD - 1);
	}

	/*
	 * Redzoning and user store require word alignment or possibly larger.
	 * Note this will be overridden by architecture or caller mandated
	 * alignment if either is greater than BYTES_PER_WORD.
	 */
	if (flags & SLAB_STORE_USER)
		ralign = BYTES_PER_WORD;

	if (flags & SLAB_RED_ZONE) {
		ralign = REDZONE_ALIGN;
		/* If redzoning, ensure that the second redzone is suitably
		 * aligned, by adjusting the object size accordingly. */
		size += REDZONE_ALIGN - 1;
		size &= ~(REDZONE_ALIGN - 1);
	}

	/* 3) caller mandated alignment */
	if (ralign < cachep->align) {
		ralign = cachep->align;
	}
	/* disable debug if necessary */
	if (ralign > __alignof__(unsigned long long))
		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
	/*
	 * 4) Store it.
	 */
	cachep->align = ralign;

	if (slab_is_available())
		gfp = GFP_KERNEL;
	else
		gfp = GFP_NOWAIT;

	setup_node_pointer(cachep);

	/*
	 * Determine if the slab management is 'on' or 'off' slab.
	 * (bootstrapping cannot cope with offslab caches so don't do
	 * it too early on. Always use on-slab management when
	 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
	 */
	if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
	    !(flags & SLAB_NOLEAKTRACE))
		/*
		 * Size is large, assume best to place the slab management obj
		 * off-slab (should allow better packing of objs).
		 */
		flags |= CFLGS_OFF_SLAB;

	size = ALIGN(size, cachep->align);
        计算分配slab需要多少空间,最后有几个object,left_over 是浪费的空间
	left_over = calculate_slab_order(cachep, size, cachep->align, flags);

	if (!cachep->num)
		return -E2BIG;
        计算slab管理结构的大小
	slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
			  + sizeof(struct slab), cachep->align);

	/*
	 * If the slab has been placed off-slab, and we have enough space then
	 * move it on-slab. This is at the expense of any extra colouring.
	 */
CFLGS_OFF_SLAB 都是没有设置的
	if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
		flags &= ~CFLGS_OFF_SLAB;
		left_over -= slab_size;
	}

	if (flags & CFLGS_OFF_SLAB) {
		/* really off slab. No need for manual alignment */
		slab_size =
		    cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);

#ifdef CONFIG_PAGE_POISONING
		/* If we're going to use the generic kernel_map_pages()
		 * poisoning, then it's going to smash the contents of
		 * the redzone and userword anyhow, so switch them off.
		 */
		if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
	}

	cachep->colour_off = cache_line_size();
	/* Offset must be a multiple of the alignment. */
	if (cachep->colour_off < cachep->align)
		cachep->colour_off = cachep->align;
	cachep->colour = left_over / cachep->colour_off;
	cachep->slab_size = slab_size;
	cachep->flags = flags;
	cachep->allocflags = 0;
	if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
		cachep->allocflags |= GFP_DMA;
	cachep->size = size;
	cachep->reciprocal_buffer_size = reciprocal_value(size);

	if (flags & CFLGS_OFF_SLAB) {
		cachep->slabp_cache = kmalloc_slab(slab_size, 0u);
		/*
		 * This is a possibility for one of the malloc_sizes caches.
		 * But since we go off slab only for object size greater than
		 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
		 * this should not happen at all.
		 * But leave a BUG_ON for some lucky dude.
		 */
		BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
	}
        设置array_cache
	err = setup_cpu_cache(cachep, gfp);
	if (err) {
		__kmem_cache_shutdown(cachep);
		return err;
	}

	if (flags & SLAB_DEBUG_OBJECTS) {
		/*
		 * Would deadlock through slab_destroy()->call_rcu()->
		 * debug_object_activate()->kmem_cache_alloc().
		 */
		WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);

		slab_set_debugobj_lock_classes(cachep);
	} else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU))
		on_slab_lock_classes(cachep);

	return 0;
}

这边比较重要的两个函数是calculate_slab_order和setup_cpu_cache

static size_t calculate_slab_order(struct kmem_cache *cachep,
			size_t size, size_t align, unsigned long flags)
{
	unsigned long offslab_limit;
	size_t left_over = 0;
	int gfporder;

	for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
		unsigned int num;
		size_t remainder;
                计算object的数量
		cache_estimate(gfporder, size, align, flags, &remainder, &num);
		if (!num)
			continue;

		if (flags & CFLGS_OFF_SLAB) {
			/*
			 * Max number of objs-per-slab for caches which
			 * use off-slab slabs. Needed to avoid a possible
			 * looping condition in cache_grow().
			 */
			offslab_limit = size - sizeof(struct slab);
			offslab_limit /= sizeof(kmem_bufctl_t);

 			if (num > offslab_limit)
				break;
		}

		/* Found something acceptable - save it away */
		cachep->num = num;  object的数量
		cachep->gfporder = gfporder;  需要1<<gfporder个page
		left_over = remainder;

		/*
		 * A VFS-reclaimable slab tends to have most allocations
		 * as GFP_NOFS and we really don't want to have to be allocating
		 * higher-order pages when we are unable to shrink dcache.
		 */
		if (flags & SLAB_RECLAIM_ACCOUNT)
			break;

		/*
		 * Large number of objects is good, but very large slabs are
		 * currently bad for the gfp()s.
		 */
		if (gfporder >= slab_max_order)
			break;

		/*
		 * Acceptable internal fragmentation?
		 */
		if (left_over * 8 <= (PAGE_SIZE << gfporder))
			break;
	}
	return left_over;
}

通过cache_estimate来计算分配几个object:

static void cache_estimate(unsigned long gfporder, size_t buffer_size,
			   size_t align, int flags, size_t *left_over,
			   unsigned int *num)
{
	int nr_objs;
	size_t mgmt_size;
	size_t slab_size = PAGE_SIZE << gfporder;

	/*
	 * The slab management structure can be either off the slab or
	 * on it. For the latter case, the memory allocated for a
	 * slab is used for:
	 *
	 * - The struct slab
	 * - One kmem_bufctl_t for each object
	 * - Padding to respect alignment of @align
	 * - @buffer_size bytes for each object
	 *
	 * If the slab management structure is off the slab, then the
	 * alignment will already be calculated into the size. Because
	 * the slabs are all pages aligned, the objects will be at the
	 * correct alignment when allocated.
	 */
	if (flags & CFLGS_OFF_SLAB) {
		mgmt_size = 0;
		nr_objs = slab_size / buffer_size;

		if (nr_objs > SLAB_LIMIT)
			nr_objs = SLAB_LIMIT;
	} else {
		/*
		 * Ignore padding for the initial guess. The padding
		 * is at most @align-1 bytes, and @buffer_size is at
		 * least @align. In the worst case, this result will
		 * be one greater than the number of objects that fit
		 * into the memory allocation when taking the padding
		 * into account.
		 */
object的数量为页大小减去管理结构slab的大小,除以每个object的大小,其中,有一个object,就需要一个
kmem_bufctl_t来指示器位置
		nr_objs = (slab_size - sizeof(struct slab)) /
			  (buffer_size + sizeof(kmem_bufctl_t));

		/*
		 * This calculated number will be either the right
		 * amount, or one greater than what we want.
		 */
		if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
		       > slab_size)
			nr_objs--;

		if (nr_objs > SLAB_LIMIT)
			nr_objs = SLAB_LIMIT;
            计算slab管理结构的大小
		mgmt_size = slab_mgmt_size(nr_objs, align);
	}
	*num = nr_objs;
        浪费的空间
	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
}

通过上面计算出了需要申请的页的大小,以及在这片空间中,可以分配几个object等一系列初始化,接着调用setup_cpu_cache

static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
	if (slab_state >= FULL)
		return enable_cpucache(cachep, gfp);

	if (slab_state == DOWN) {
		/*
		 * Note: Creation of first cache (kmem_cache).
		 * The setup_node is taken care
		 * of by the caller of __kmem_cache_create
		 */
		cachep->array[smp_processor_id()] = &initarray_generic.cache;
		slab_state = PARTIAL;
	} else if (slab_state == PARTIAL) {
		/*
		 * Note: the second kmem_cache_create must create the cache
		 * that's used by kmalloc(24), otherwise the creation of
		 * further caches will BUG().
		 */
		cachep->array[smp_processor_id()] = &initarray_generic.cache;

		/*
		 * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is
		 * the second cache, then we need to set up all its node/,
		 * otherwise the creation of further caches will BUG().
		 */
		set_up_node(cachep, SIZE_AC);
		if (INDEX_AC == INDEX_NODE)
			slab_state = PARTIAL_NODE;
		else
			slab_state = PARTIAL_ARRAYCACHE;
	} else {
		/* Remaining boot caches */
		cachep->array[smp_processor_id()] =
			kmalloc(sizeof(struct arraycache_init), gfp);

		if (slab_state == PARTIAL_ARRAYCACHE) {
			set_up_node(cachep, SIZE_NODE);
			slab_state = PARTIAL_NODE;
		} else {
			int node;
			for_each_online_node(node) {
				cachep->node[node] =
				    kmalloc_node(sizeof(struct kmem_cache_node),
						gfp, node);
				BUG_ON(!cachep->node[node]);
				kmem_cache_node_init(cachep->node[node]);
			}
		}
	}
	cachep->node[numa_mem_id()]->next_reap =
			jiffies + REAPTIMEOUT_LIST3 +
			((unsigned long)cachep) % REAPTIMEOUT_LIST3;

	cpu_cache_get(cachep)->avail = 0;
	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
	cpu_cache_get(cachep)->batchcount = 1;
	cpu_cache_get(cachep)->touched = 0;
	cachep->batchcount = 1;
	cachep->limit = BOOT_CPUCACHE_ENTRIES;
	return 0;
}

上面函数由于slab_state最初是DOWN,所以用静态变量initarray_generic初始化cachep->array,注意,然后会设置slab_state = PARTIAL,后面为array_cache申请slab的时候,还是指向同一个静态变量initarray_generic,不过由于batchcount设置为1,所以每次只有一个object在这里,应该问题不大。这边初始化完cachep->array以后,就可以申请kmem_cache了,并且设置avail为0,说明cpu cache中暂时没有slab,需要申请分配,batchcount为1,每次向slab取一个,真正为slab分配空间是向其申请内存的时候,接着往下看代码,这边初始化完了kmem_cache,然后初始化kmalloc-ac。

kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac",
                    kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS);

可以看到初始化完以后,会把得到的kmem_cache放到kmalloc_caches中,kmalloc_caches是个数组,slab通过一个计算方式,会把不通大小的slab放入kmalloc_caches数组中,这样要申请对应大小范围的内存就可以根据索引从数组中取出kmem_cache。

struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
				unsigned long flags)
{
        向slab系统申请kmem_cache 空间
	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);

	if (!s)
		panic("Out of memory when creating slab %s\n", name);
        创建自己的slab
	create_boot_cache(s, name, size, flags);
	list_add(&s->list, &slab_caches);
	s->refcount = 1;
	return s;
}

kmem_cache_alloc

     --------->slab_alloc

          ----------->__do_cache_alloc

               ------------>____cache_alloc

static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
	void *objp;
	struct array_cache *ac;
	bool force_refill = false;

	check_irq_off();
           取出cachep->array
	ac = cpu_cache_get(cachep);
        第一次进来,avail为0
	if (likely(ac->avail)) {
		ac->touched = 1;
		objp = ac_get_obj(cachep, ac, flags, false);

		/*
		 * Allow for the possibility all avail objects are not allowed
		 * by the current flags
		 */
		if (objp) {
			STATS_INC_ALLOCHIT(cachep);
			goto out;
		}
		force_refill = true;
	}

	STATS_INC_ALLOCMISS(cachep);
            这边进行分配object
	objp = cache_alloc_refill(cachep, flags, force_refill);
	/*
	 * the 'ac' may be updated by cache_alloc_refill(),
	 * and kmemleak_erase() requires its correct value.
	 */
	ac = cpu_cache_get(cachep);

out:
	/*
	 * To avoid a false negative, if an object that is in one of the
	 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
	 * treat the array pointers as a reference to the object.
	 */
	if (objp)
		kmemleak_erase(&ac->entry[ac->avail]);
	return objp;
}

调用cache_alloc_refill分配object

static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
							bool force_refill)
{
	int batchcount;
	struct kmem_cache_node *n;
	struct array_cache *ac;
	int node;

	check_irq_off();
	node = numa_mem_id();
	if (unlikely(force_refill))
		goto force_grow;
retry:
	ac = cpu_cache_get(cachep);
	batchcount = ac->batchcount;
	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
		/*
		 * If there was little recent activity on this cache, then
		 * perform only a partial refill.  Otherwise we could generate
		 * refill bouncing.
		 */
		batchcount = BATCHREFILL_LIMIT;
	}
	n = cachep->node[node];

	BUG_ON(ac->avail > 0 || !n);
	spin_lock(&n->list_lock);

	/* See if we can refill from the shared array */
	if (n->shared && transfer_objects(ac, n->shared, batchcount)) {
		n->shared->touched = 1;
		goto alloc_done;
	}

	while (batchcount > 0) {
		struct list_head *entry;
		struct slab *slabp;
		/* Get slab alloc is to come from. */
		entry = n->slabs_partial.next;
		if (entry == &n->slabs_partial) {
			n->free_touched = 1;
			entry = n->slabs_free.next;
			if (entry == &n->slabs_free)  第一次进来,node都是空的,去执行    
                                                                    must_grow
				goto must_grow;
		}

		slabp = list_entry(entry, struct slab, list);
		check_slabp(cachep, slabp);
		check_spinlock_acquired(cachep);

		/*
		 * The slab was either on partial or free list so
		 * there must be at least one object available for
		 * allocation.
		 */
		BUG_ON(slabp->inuse >= cachep->num);

		while (slabp->inuse < cachep->num && batchcount--) {
			STATS_INC_ALLOCED(cachep);
			STATS_INC_ACTIVE(cachep);
			STATS_SET_HIGH(cachep);

			ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
									node));
		}
		check_slabp(cachep, slabp);

		/* move slabp to correct slabp list: */
		list_del(&slabp->list);
		if (slabp->free == BUFCTL_END)
			list_add(&slabp->list, &n->slabs_full);
		else
			list_add(&slabp->list, &n->slabs_partial);
	}

must_grow:
	n->free_objects -= ac->avail;
alloc_done:
	spin_unlock(&n->list_lock);

	if (unlikely(!ac->avail)) {
		int x;
force_grow:
                 申请空间
		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);

		/* cache_grow can reenable interrupts, then ac could change. */
		ac = cpu_cache_get(cachep);
		node = numa_mem_id();

		/* no objects in sight? abort */
		if (!x && (ac->avail == 0 || force_refill))
			return NULL;
                申请完再尝试把object放到array中
		if (!ac->avail)		/* objects refilled by interrupt? */
			goto retry;
	}
	ac->touched = 1;

	return ac_get_obj(cachep, ac, flags, force_refill);
}

上面函数,刚开始是没有为slab申请object的,所以执行must_grow,为node分配空间,最后再执行retry,执行ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,node));把object 从node 挂载到cpu array中。

先看cache_grow

static int cache_grow(struct kmem_cache *cachep,
		gfp_t flags, int nodeid, void *objp)
{
	struct slab *slabp;
	size_t offset;
	gfp_t local_flags;
	struct kmem_cache_node *n;

	/*
	 * Be lazy and only check for valid flags here,  keeping it out of the
	 * critical path in kmem_cache_alloc().
	 */
	BUG_ON(flags & GFP_SLAB_BUG_MASK);
	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);

	/* Take the node list lock to change the colour_next on this node */
	check_irq_off();
	n = cachep->node[nodeid];
	spin_lock(&n->list_lock);

	/* Get colour for the slab, and cal the next value. */
	offset = n->colour_next;
	n->colour_next++;
	if (n->colour_next >= cachep->colour)
		n->colour_next = 0;
	spin_unlock(&n->list_lock);

	offset *= cachep->colour_off;

	if (local_flags & __GFP_WAIT)
		local_irq_enable();

	/*
	 * The test for missing atomic flag is performed here, rather than
	 * the more obvious place, simply to reduce the critical path length
	 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
	 * will eventually be caught here (where it matters).
	 */
	kmem_flagcheck(cachep, flags);

	/*
	 * Get mem for the objs.  Attempt to allocate a physical page from
	 * 'nodeid'.
	 */
	if (!objp)
                向伙伴系统申请page,返回该page指向的虚拟地址
		objp = kmem_getpages(cachep, local_flags, nodeid);
	if (!objp)
		goto failed;

	/* Get slab management. */
        利用申请到的内存,初始化slab
	slabp = alloc_slabmgmt(cachep, objp, offset,
			local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
	if (!slabp)
		goto opps1;
        把slab和page进行关联
	slab_map_pages(cachep, slabp, objp);
        在slab管理数据后面,为kmem_bufctl_t结构赋值,每个kmem_bufctl_t结构里面放入下一个object
        的index
	cache_init_objs(cachep, slabp);

	if (local_flags & __GFP_WAIT)
		local_irq_disable();
	check_irq_off();
	spin_lock(&n->list_lock);

	/* Make slab active. */
        这时候slab结构没有分配过,所以挂载在slabs_free上
	list_add_tail(&slabp->list, &(n->slabs_free));
	STATS_INC_GROWN(cachep);
	n->free_objects += cachep->num;
	spin_unlock(&n->list_lock);
	return 1;
opps1:
	kmem_freepages(cachep, objp);
failed:
	if (local_flags & __GFP_WAIT)
		local_irq_disable();
	return 0;
}

kmem_getpages最终调用到__alloc_pages,向伙伴系统申请page,返回该page代表的虚拟地址。然后利用申请到的内存块,初始化slab:

static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
				   int colour_off, gfp_t local_flags,
				   int nodeid)
{
	struct slab *slabp;

	if (OFF_SLAB(cachep)) {
		/* Slab management obj is off-slab. */
		slabp = kmem_cache_alloc_node(cachep->slabp_cache,
					      local_flags, nodeid);
		/*
		 * If the first object in the slab is leaked (it's allocated
		 * but no one has a reference to it), we want to make sure
		 * kmemleak does not treat the ->s_mem pointer as a reference
		 * to the object. Otherwise we will not report the leak.
		 */
		kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
				   local_flags);
		if (!slabp)
			return NULL;
	} else {
		slabp = objp + colour_off;
		colour_off += cachep->slab_size;
	}
	slabp->inuse = 0;
	slabp->colouroff = colour_off;
	slabp->s_mem = objp + colour_off;记录object开始的地址
	slabp->nodeid = nodeid;
	slabp->free = 0;
	return slabp;
}
static void cache_init_objs(struct kmem_cache *cachep,
			    struct slab *slabp)
{
	int i;

	for (i = 0; i < cachep->num; i++) {
		void *objp = index_to_obj(cachep, slabp, i);
		if (cachep->ctor)
			cachep->ctor(objp);
		slab_bufctl(slabp)[i] = i + 1;
	} 初始化slab管理结构后面的kmem_bufctl_t数组,该数组的值代表每个object的index
	slab_bufctl(slabp)[i - 1] = BUFCTL_END;
}

所以初始化好以后,slabs_free中已经有值,在执行之前的retry:

while (slabp->inuse < cachep->num && batchcount--) {
			STATS_INC_ALLOCED(cachep);
			STATS_INC_ACTIVE(cachep);
			STATS_SET_HIGH(cachep);

			ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
									node));
		}

batchcount为1,只释放一个object。ac_put_obj把object从slab放入array中,看下slab_get_obj:

static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
				int nodeid)
{
	void *objp = index_to_obj(cachep, slabp, slabp->free);
	kmem_bufctl_t next;

	slabp->inuse++;
	next = slab_bufctl(slabp)[slabp->free];
#if DEBUG
	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
	WARN_ON(slabp->nodeid != nodeid);
#endif
	slabp->free = next;

	return objp;
}

可以看到每次更新slabp->free的值,而这个值从kmem_bufctl_t数组结构中取,数组中的值应该是这样的kmem_bufctl[free]=free+1,index为n的值为n+1,指向下一个object。这样array中就有object了。返回kmem_cache的空间。在create_kmalloc_cache中,又调用create_boot_cache,创建kmalloc-ac的slab。这边调用需要注意下:

create_boot_cache

     -------->__kmem_cache_create

          ---------->setup_cpu_cache

在setup_cpu_cache中,走slab_state == PARTIAL这一支。

 else if (slab_state == PARTIAL) {
		/*
		 * Note: the second kmem_cache_create must create the cache
		 * that's used by kmalloc(24), otherwise the creation of
		 * further caches will BUG().
		 */
		cachep->array[smp_processor_id()] = &initarray_generic.cache;

		/*
		 * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is
		 * the second cache, then we need to set up all its node/,
		 * otherwise the creation of further caches will BUG().
		 */
		set_up_node(cachep, SIZE_AC);
		if (INDEX_AC == INDEX_NODE)
			slab_state = PARTIAL_NODE;
		else
			slab_state = PARTIAL_ARRAYCACHE;
	} 

因为cachep->array的slab正在创建,所以这边还是只能用静态的,然后调用set_up_node,从init_kmem_cache_node数组中拿一个node赋值给cachep->node[0],最后设置slab_state = PARTIAL_ARRAYCACHE;这样kmalloc-ac的初始化就做完了,再看一下kmalloc-node的初始化有什么不同,这个时候,kmem_cache和array_cache的slab都已经初始化完毕了,所以只有kmem_cache_node才需要使用静态变量。初始化过程和上面基本一样,重点看下这里:

create_boot_cache

     -------->__kmem_cache_create

          ---------->setup_cpu_cache

{
		/* Remaining boot caches */
		cachep->array[smp_processor_id()] =
			kmalloc(sizeof(struct arraycache_init), gfp);

		if (slab_state == PARTIAL_ARRAYCACHE) {
			set_up_node(cachep, SIZE_NODE);
			slab_state = PARTIAL_NODE;
		} else {
			int node;
			for_each_online_node(node) {
				cachep->node[node] =
				    kmalloc_node(sizeof(struct kmem_cache_node),
						gfp, node);
				BUG_ON(!cachep->node[node]);
				kmem_cache_node_init(cachep->node[node]);
			}
		}
	}

可以看到cachep->array通过kmalloc动态分配,而kmem_cache_node还是使用静态设置,和之前描述的一样。

kmalloc

   ------>__kmalloc

         -------->__do_kmalloc

static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
					  unsigned long caller)
{
	struct kmem_cache *cachep;
	void *ret;

	/* If you want to save a few bytes .text space: replace
	 * __ with kmem_.
	 * Then kmalloc uses the uninlined functions instead of the inline
	 * functions.
	 */
	cachep = kmalloc_slab(size, flags); 获取可以分配当前szie空间的kmem_cache 结构
	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
		return cachep;
	ret = slab_alloc(cachep, flags, caller);从kmem_cache 结构中获取object

	trace_kmalloc(caller, ret,
		      size, cachep->size, flags);

	return ret;
}

该函数先用kmalloc_slab获取kmem_cache结构:

struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
{
	int index;

	if (size > KMALLOC_MAX_SIZE) {
		WARN_ON_ONCE(!(flags & __GFP_NOWARN));
		return NULL;
	}

	if (size <= 192) {
		if (!size)
			return ZERO_SIZE_PTR;

		index = size_index[size_index_elem(size)];
	} else
		index = fls(size - 1);  根据size算出index

#ifdef CONFIG_ZONE_DMA
	if (unlikely((flags & GFP_DMA)))
		return kmalloc_dma_caches[index];

#endif
        利用index从kmalloc_caches中查找
	return kmalloc_caches[index];
}

前面已经在kmalloc_caches中放入了初始化的kmalloc-ac,这边就返回kmalloc-ac的kmem_cache结构,然后从中分配object。这样slab系统的一些基本结构都已经初始化完毕,但是里面使用了很多静态结构,所以普在kmem_cache_init函数后半段,用动态分配取代这部分静态结构:

/* 4) Replace the bootstrap head arrays */
	{
		struct array_cache *ptr;

		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);

		memcpy(ptr, cpu_cache_get(kmem_cache),
		       sizeof(struct arraycache_init));
		/*
		 * Do not assume that spinlocks can be initialized via memcpy:
		 */
		spin_lock_init(&ptr->lock);

		kmem_cache->array[smp_processor_id()] = ptr;

		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);

		BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC])
		       != &initarray_generic.cache);
		memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]),
		       sizeof(struct arraycache_init));
		/*
		 * Do not assume that spinlocks can be initialized via memcpy:
		 */
		spin_lock_init(&ptr->lock);

		kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr;
	}
	/* 5) Replace the bootstrap kmem_cache_node */
	{
		int nid;

		for_each_online_node(nid) {
			init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);

			init_list(kmalloc_caches[INDEX_AC],
				  &init_kmem_cache_node[SIZE_AC + nid], nid);

			if (INDEX_AC != INDEX_NODE) {
				init_list(kmalloc_caches[INDEX_NODE],
					  &init_kmem_cache_node[SIZE_NODE + nid], nid);
			}
		}
	}

最后调用create_kmalloc_caches:

void __init create_kmalloc_caches(unsigned long flags)
{
	int i;

	/*
	 * Patch up the size_index table if we have strange large alignment
	 * requirements for the kmalloc array. This is only the case for
	 * MIPS it seems. The standard arches will not generate any code here.
	 *
	 * Largest permitted alignment is 256 bytes due to the way we
	 * handle the index determination for the smaller caches.
	 *
	 * Make sure that nothing crazy happens if someone starts tinkering
	 * around with ARCH_KMALLOC_MINALIGN
	 */
	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
		(KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));

	for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
		int elem = size_index_elem(i);

		if (elem >= ARRAY_SIZE(size_index))
			break;
		size_index[elem] = KMALLOC_SHIFT_LOW;
	}

	if (KMALLOC_MIN_SIZE >= 64) {
		/*
		 * The 96 byte size cache is not used if the alignment
		 * is 64 byte.
		 */
		for (i = 64 + 8; i <= 96; i += 8)
			size_index[size_index_elem(i)] = 7;

	}

	if (KMALLOC_MIN_SIZE >= 128) {
		/*
		 * The 192 byte sized cache is not used if the alignment
		 * is 128 byte. Redirect kmalloc to use the 256 byte cache
		 * instead.
		 */
		for (i = 128 + 8; i <= 192; i += 8)
			size_index[size_index_elem(i)] = 8;
	}
	for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
		if (!kmalloc_caches[i]) {
			kmalloc_caches[i] = create_kmalloc_cache(NULL,
							1 << i, flags);
		}

		/*
		 * Caches that are not of the two-to-the-power-of size.
		 * These have to be created immediately after the
		 * earlier power of two caches
		 */
		if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)
			kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags);

		if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
			kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags);
	}

	/* Kmalloc array is now usable */
	slab_state = UP;

	for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
		struct kmem_cache *s = kmalloc_caches[i];
		char *n;

		if (s) {
			n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i));

			BUG_ON(!n);
			s->name = n;
		}
	}

#ifdef CONFIG_ZONE_DMA
	for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
		struct kmem_cache *s = kmalloc_caches[i];

		if (s) {
			int size = kmalloc_size(i);
			char *n = kasprintf(GFP_NOWAIT,
				 "dma-kmalloc-%d", size);

			BUG_ON(!n);
			kmalloc_dma_caches[i] = create_kmalloc_cache(n,
				size, SLAB_CACHE_DMA | flags);
		}
	}
#endif
}

上面函数应该是初始化某些size的slab,然后放入kmalloc_caches数组中,后面要申请属于这个size范围的slab,就可以直接索引kmalloc_caches数组的元素,进行分配了。至此,slab系统初始化完毕。可以用一个图来描述主要结构相互之间的关系:

下面贴出一个博文,作为借鉴:

https://www.jianshu.com/p/95d68389fbd1

http://blog.chinaunix.net/uid-20786208-id-4831194.html

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值