Xiao Zhang Learns the Linux Kernel: 9. The slab Allocator

Overview

Unlike the page-frame allocator, the slab allocator hands out memory at byte granularity rather than by whole pages. It requests pages from the page-frame allocator (the buddy system) and then manages them itself, carving each page into small objects — a "shared bunk" model of allocation. kmalloc and many of the kernel's dedicated data structures are served by the slab allocator. Its smallest granularity is 8 bytes: requests are rounded up to the nearest cache size (all multiples of 8), so asking for fewer than 8 bytes still consumes an 8-byte object.
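As a quick orientation, here is a minimal usage sketch, assuming kernel-module context (the demo function is made up for illustration). The 5-byte request below is served from the smallest suitable cache, kmalloc-8:

#include <linux/slab.h>
#include <linux/errno.h>

static int demo(void)
{
	char *buf = kmalloc(5, GFP_KERNEL);   /* rounded up to an 8-byte object */
	if (!buf)
		return -ENOMEM;

	/* ... use buf ... */

	kfree(buf);                           /* object goes back to its cache */
	return 0;
}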

Characteristics

For each object size, the slab allocator maintains three lists of pages: full, partial, and free. Allocation tries the partial list first, then the free list; if the free list is also empty, a new page is requested from the buddy system and added to the free list. Each object size has its own cache, described by struct kmem_cache.
The per-node portion of a cache is represented by struct kmem_cache_node.
kmem_cache_node exists to support NUMA: there is one instance per NUMA memory node (not per CPU).

struct kmem_cache_node {
	spinlock_t list_lock;

#ifdef CONFIG_SLAB
	struct list_head slabs_partial;	/* partial list first, better asm code */
	struct list_head slabs_full;
	struct list_head slabs_free;
	unsigned long total_slabs;	/* length of all slab lists */
	unsigned long free_slabs;	/* length of free slab list only */
	unsigned long free_objects;
	unsigned int free_limit;
	unsigned int colour_next;	/* Per-node cache coloring */
	struct array_cache *shared;	/* shared per node */
	struct alien_cache **alien;	/* on other nodes */
	unsigned long next_reap;	/* updated without locking */
	int free_touched;		/* updated without locking */
#endif
};

The kmem_cache structure describes one whole cache (one object size):

struct kmem_cache {
	struct array_cache __percpu *cpu_cache;        /* per-CPU variable; acts as a fast lookaside cache */

/* 1) Cache tunables. Protected by slab_mutex */
	unsigned int batchcount;
	unsigned int limit;
	unsigned int shared;

	unsigned int size;               /* total per-object size */
	struct reciprocal_value reciprocal_buffer_size;
/* 2) touched by every alloc & free from the backend */

	slab_flags_t flags;		/* constant flags */
	unsigned int num;		/* # of objs per slab */

/* 3) cache_grow/shrink */
	/* order of pgs per slab (2^n) */
	unsigned int gfporder;

	/* force GFP flags, e.g. GFP_DMA */
	gfp_t allocflags;

	size_t colour;			/* cache colouring range */
	unsigned int colour_off;	/* colour offset */
	struct kmem_cache *freelist_cache;
	unsigned int freelist_size;

	/* constructor func */
	void (*ctor)(void *obj);

/* 4) cache creation/removal */
	const char *name;
	struct list_head list;
	int refcount;
	int object_size;
	int align;

....

	/*
	 * If debugging is enabled, then the allocator can add additional
	 * fields and/or padding to every object. 'size' contains the total
	 * object size including these internal fields, while 'obj_offset'
	 * and 'object_size' contain the offset to the user object and its
	 * size.
	 */
	int obj_offset;
#endif /* CONFIG_DEBUG_SLAB */

#ifdef CONFIG_MEMCG
	struct memcg_cache_params memcg_params;
#endif
#ifdef CONFIG_KASAN
	struct kasan_cache kasan_info;
#endif

#ifdef CONFIG_SLAB_FREELIST_RANDOM
	unsigned int *random_seq;
#endif

	unsigned int useroffset;	/* Usercopy region offset */
	unsigned int usersize;		/* Usercopy region size */

	struct kmem_cache_node *node[MAX_NUMNODES];      /* one per-node descriptor for each NUMA node */
};
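Besides the generic kmalloc caches, code can create a dedicated kmem_cache for a frequently allocated structure through the standard API. A minimal sketch (struct foo and all foo_* names are made up for illustration):

#include <linux/slab.h>
#include <linux/errno.h>

struct foo {
	int id;
	char payload[56];
};

static struct kmem_cache *foo_cachep;

static int foo_cache_init(void)
{
	/* name, object size, alignment, flags, constructor */
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				       0, SLAB_HWCACHE_ALIGN, NULL);
	return foo_cachep ? 0 : -ENOMEM;
}

static struct foo *foo_alloc(void)
{
	return kmem_cache_alloc(foo_cachep, GFP_KERNEL);  /* one object from the cache */
}

static void foo_free(struct foo *f)
{
	kmem_cache_free(foo_cachep, f);
}

Every cache created this way shows up in /proc/slabinfo next to the kmalloc-* caches.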

Slab initialization

Let's look at how kmem_cache_init creates the kmalloc caches, which live in the kmalloc_caches array:

extern struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1];

void __init kmem_cache_init(void)
{
	....
	kmalloc_caches[INDEX_NODE] = create_kmalloc_cache(
				kmalloc_info[INDEX_NODE].name,
				kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS,
				0, kmalloc_size(INDEX_NODE));
	....
}

kmalloc_info is the array describing the kmalloc caches, defined in mm/slab_common.c:

const struct kmalloc_info_struct kmalloc_info[] __initconst = {
	{NULL,                      0},		{"kmalloc-96",             96},
	{"kmalloc-192",           192},		{"kmalloc-8",               8},
	{"kmalloc-16",             16},		{"kmalloc-32",             32},
	{"kmalloc-64",             64},		{"kmalloc-128",           128},
	{"kmalloc-256",           256},		{"kmalloc-512",           512},
	{"kmalloc-1024",         1024},		{"kmalloc-2048",         2048},
	{"kmalloc-4096",         4096},		{"kmalloc-8192",         8192},
	{"kmalloc-16384",       16384},		{"kmalloc-32768",       32768},
	{"kmalloc-65536",       65536},		{"kmalloc-131072",     131072},
	{"kmalloc-262144",     262144},		{"kmalloc-524288",     524288},
	{"kmalloc-1048576",   1048576},		{"kmalloc-2097152",   2097152},
	{"kmalloc-4194304",   4194304},		{"kmalloc-8388608",   8388608},
	{"kmalloc-16777216", 16777216},		{"kmalloc-33554432", 33554432},
	{"kmalloc-67108864", 67108864}
};

Notice that these entries are not ordered from small to large. Why? Because the array is indexed by the value kmalloc_slab computes below: for power-of-two sizes the index is fls(size - 1), so kmalloc-8 sits at index 3, kmalloc-16 at index 4, and so on, while indices 1 and 2 are reserved for the two non-power-of-two sizes, 96 and 192.
The table goes up to 64MB, though the actual ceiling, KMALLOC_MAX_CACHE_SIZE, depends on the kernel configuration.
The size-to-index mapping table covers only sizes up to 192:

static u8 size_index[24] __ro_after_init = {
	3,	/* 8 */
	4,	/* 16 */
	5,	/* 24 */
	5,	/* 32 */
	6,	/* 40 */
	6,	/* 48 */
	6,	/* 56 */
	6,	/* 64 */
	1,	/* 72 */
	1,	/* 80 */
	1,	/* 88 */
	1,	/* 96 */
	7,	/* 104 */
	7,	/* 112 */
	7,	/* 120 */
	7,	/* 128 */
	2,	/* 136 */
	2,	/* 144 */
	2,	/* 152 */
	2,	/* 160 */
	2,	/* 168 */
	2,	/* 176 */
	2,	/* 184 */
	2	/* 192 */
};

For sizes above 192, the index comes from fls (find last set): for example, a 256-byte request gives fls(255) = 8, the index of kmalloc-256.

static __always_inline int fls(int x)
{
	return x ? sizeof(x) * 8 - __builtin_clz(x) : 0;
}

The allocation path


Step one: kmalloc_slab looks up the cache (the kmem_cache) for the requested size and simply returns kmalloc_caches[index]:

struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
{
	unsigned int index;

	if (size <= 192) {
		if (!size)
			return ZERO_SIZE_PTR;

		index = size_index[size_index_elem(size)];
	} else {
		if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
			WARN_ON(1);
			return NULL;
		}
		index = fls(size - 1);
	}

#ifdef CONFIG_ZONE_DMA
	if (unlikely((flags & GFP_DMA)))
		return kmalloc_dma_caches[index];

#endif
	return kmalloc_caches[index];
}
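To make the index computation concrete, here is a small user-space replica of the two branches above (size_index and fls are copied from the kernel; kmalloc_index is a made-up name, and size is assumed to be at least 1):

#include <stdio.h>
#include <stddef.h>

/* same table as above: maps (size - 1) / 8 to a kmalloc_caches index */
static const unsigned char size_index[24] = {
	3, 4, 5, 5, 6, 6, 6, 6,   /*   8 ..  64 */
	1, 1, 1, 1, 7, 7, 7, 7,   /*  72 .. 128 */
	2, 2, 2, 2, 2, 2, 2, 2,   /* 136 .. 192 */
};

static int fls(int x)
{
	return x ? sizeof(x) * 8 - __builtin_clz(x) : 0;
}

static unsigned int kmalloc_index(size_t size)
{
	if (size <= 192)
		return size_index[(size - 1) / 8];  /* what size_index_elem() does */
	return fls(size - 1);
}

int main(void)
{
	/* 100 -> 7 (kmalloc-128), 192 -> 2 (kmalloc-192),
	 * 256 -> 8 (kmalloc-256), 3000 -> 12 (kmalloc-4096) */
	size_t sizes[] = { 100, 192, 256, 3000 };
	for (int i = 0; i < 4; i++)
		printf("size %zu -> index %u\n", sizes[i], kmalloc_index(sizes[i]));
	return 0;
}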

Step two: allocate memory within that slab.
Let's go straight to the ____cache_alloc function:

static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
	void *objp;
	struct array_cache *ac;     

	check_irq_off();

	ac = cpu_cache_get(cachep);     /* fetch the per-CPU variable cachep->cpu_cache */
	if (likely(ac->avail)) {      /* fast path: objects available in the per-CPU cache */
		ac->touched = 1;
		objp = ac->entry[--ac->avail];    /* entry[] holds object pointers carved from slab pages, consumed from the top down */

		STATS_INC_ALLOCHIT(cachep);
		goto out;
	}

	STATS_INC_ALLOCMISS(cachep);
	objp = cache_alloc_refill(cachep, flags);    /* refill the per-CPU cache and allocate */
	/*
	 * the 'ac' may be updated by cache_alloc_refill(),
	 * and kmemleak_erase() requires its correct value.
	 */
	ac = cpu_cache_get(cachep);     /* re-fetch cpu_cache: the refill may have replaced it */

out:
	/*
	 * To avoid a false negative, if an object that is in one of the
	 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
	 * treat the array pointers as a reference to the object.
	 */
	if (objp)
		kmemleak_erase(&ac->entry[ac->avail]);     /* object handed out: clear the corresponding cpu_cache slot */
	return objp;
}

____cache_alloc takes three steps:

  1. Try to take an object from kmem_cache->cpu_cache. The cpu_cache->entry[] array holds the addresses of objects carved out of slab pages, consumed from the top down.
    The array_cache structure is this fast lookaside cache: it holds at most limit object pointers, each refill moves up to batchcount objects, and when the cache has seen little recent activity the refill is capped at BATCHREFILL_LIMIT, i.e. 16 (a simplified model of this structure follows this list).
struct array_cache {
	unsigned int avail;     /* number of objects currently available */
	unsigned int limit;     /* capacity of the entry[] array */
	unsigned int batchcount;      /* objects moved per refill/flush */
	unsigned int touched;    /* set when the cache was recently used */
	void *entry[];	/*
			 * Must have this definition in here for the proper
			 * alignment of array_cache. Also simplifies accessing
			 * the entries.
			 */
};

  2. If the CPU cache has nothing available, refill cpu_cache and update the slab lists.
  3. Clear the cpu_cache slot that was just handed out (the kmemleak_erase call).
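As promised above, a minimal user-space model of the array_cache fast path: entry[] behaves as a LIFO stack of object pointers, popped on allocation and pushed back on free (the model_* names are made up; the real code also handles refill, limits and NUMA placement):

#include <stdio.h>

#define LIMIT 16

struct model_array_cache {
	unsigned int avail;       /* objects currently on the stack */
	void *entry[LIMIT];       /* object pointers, filled from index 0 upward */
};

static void *model_alloc(struct model_array_cache *ac)
{
	if (ac->avail)
		return ac->entry[--ac->avail];  /* hit: pop the most recent object */
	return NULL;                            /* miss: the kernel would refill here */
}

static void model_free(struct model_array_cache *ac, void *obj)
{
	if (ac->avail < LIMIT)
		ac->entry[ac->avail++] = obj;   /* push back, cache-warm for reuse */
}

int main(void)
{
	struct model_array_cache ac = { 0 };
	int objs[2];

	model_free(&ac, &objs[0]);
	model_free(&ac, &objs[1]);

	/* LIFO: the most recently freed object comes back first */
	printf("%p %p\n", model_alloc(&ac), model_alloc(&ac));
	return 0;
}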
Now for the key part, cache_alloc_refill, which refills the cpu_cache entries:

static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
	int batchcount;
	struct kmem_cache_node *n;
	struct array_cache *ac, *shared;
	int node;
	void *list = NULL;
	struct page *page;

	check_irq_off();
	node = numa_mem_id();

	ac = cpu_cache_get(cachep);
	batchcount = ac->batchcount;
	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
		/*
		 * If there was little recent activity on this cache, then
		 * perform only a partial refill.  Otherwise we could generate
		 * refill bouncing.
		 */
		batchcount = BATCHREFILL_LIMIT;    /* cap at 16 */
	}
	n = get_node(cachep, node);

	BUG_ON(ac->avail > 0 || !n);
	shared = READ_ONCE(n->shared);
	if (!n->free_objects && (!shared || !shared->avail))
		goto direct_grow;

	spin_lock(&n->list_lock);
	shared = READ_ONCE(n->shared);

	/* See if we can refill from the shared array */
	if (shared && transfer_objects(ac, shared, batchcount)) {
		shared->touched = 1;
		goto alloc_done;
	}

	while (batchcount > 0) {
		/* Get slab alloc is to come from. */
		page = get_first_slab(n, false);        /* grab a page, preferring the partial list */
		if (!page)
			goto must_grow;

		check_spinlock_acquired(cachep);
		/* carve objects from the page and fill cpu_cache->entry */
		batchcount = alloc_block(cachep, ac, page, batchcount);
		fixup_slab_list(cachep, n, page, &list);    /* fix up the lists: a page that became full moves from partial to full */
	}

must_grow:
	n->free_objects -= ac->avail;
alloc_done:
	spin_unlock(&n->list_lock);
	fixup_objfreelist_debug(cachep, &list);

direct_grow:
	if (unlikely(!ac->avail)) {
		/* Check if we can use obj in pfmemalloc slab */
		if (sk_memalloc_socks()) {
			void *obj = cache_alloc_pfmemalloc(cachep, n, flags);

			if (obj)
				return obj;
		}

		page = cache_grow_begin(cachep, gfp_exact_node(flags), node);

		/*
		 * cache_grow_begin() can reenable interrupts,
		 * then ac could change.
		 */
		ac = cpu_cache_get(cachep);
		if (!ac->avail && page)
			alloc_block(cachep, ac, page, batchcount);
		cache_grow_end(cachep, page);

		if (!ac->avail)
			return NULL;
	}
	ac->touched = 1;

	return ac->entry[--ac->avail];
}

get_first_slab fetches a page to allocate from, preferring the partial list and falling back to the free list:

static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
{
	struct page *page;

	assert_spin_locked(&n->list_lock);
	page = list_first_entry_or_null(&n->slabs_partial, struct page, lru);
	if (!page) {
		n->free_touched = 1;
		page = list_first_entry_or_null(&n->slabs_free, struct page,
						lru);
		if (page)
			n->free_slabs--;
	}

	if (sk_memalloc_socks())
		page = get_valid_first_slab(n, page, pfmemalloc);

	return page;
}

alloc_block fills the cpu_cache->entry array:

static __always_inline int alloc_block(struct kmem_cache *cachep,
		struct array_cache *ac, struct page *page, int batchcount)
{
	/*
	 * There must be at least one object available for
	 * allocation.
	 */
	BUG_ON(page->active >= cachep->num);

	while (page->active < cachep->num && batchcount--) {
		STATS_INC_ALLOCED(cachep);
		STATS_INC_ACTIVE(cachep);
		STATS_SET_HIGH(cachep);

		ac->entry[ac->avail++] = slab_get_obj(cachep, page);
	}
	return batchcount;
}

fixup_slab_list moves the page onto the correct list:

static inline void fixup_slab_list(struct kmem_cache *cachep,
				struct kmem_cache_node *n, struct page *page,
				void **list)
{
	/* move slabp to correct slabp list: */
	list_del(&page->lru);
	if (page->active == cachep->num) {
	/* if the number of objects handed out equals the per-slab object count, move the page to the full list */
		list_add(&page->lru, &n->slabs_full);
		if (OBJFREELIST_SLAB(cachep)) {
#if DEBUG
			/* Poisoning will be done without holding the lock */
			if (cachep->flags & SLAB_POISON) {
				void **objp = page->freelist;

				*objp = *list;
				*list = objp;
			}
#endif
			page->freelist = NULL;
		}
	} else
		list_add(&page->lru, &n->slabs_partial);   /* otherwise back onto the partial list */
}

page->active exists solely for the slab allocator: every object handed out of a page increments active. Once it equals cachep->num, the number of objects one slab holds (slab size divided by object size; a slab spans 2^gfporder pages), the page is fully allocated and joins the full list. For example, a kmalloc-256 cache with gfporder 0 holds num = 4096 / 256 = 16 objects per slab.

slab_get_obj() is what does page->active++: an object moved into cpu_cache already counts as allocated. So the allocator does not pull one object from the slab at a time; it pulls a whole batch into the per-CPU cache, and subsequent allocations are served straight from cpu_cache.

static void *slab_get_obj(struct kmem_cache *cachep, struct page *page)
{
	void *objp;

	objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
	page->active++;

#if DEBUG
	if (cachep->flags & SLAB_STORE_USER)
		set_store_user_dirty(cachep);
#endif

	return objp;
}
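For completeness, the two helpers slab_get_obj calls are tiny; in this generation of mm/slab.c they are essentially the following — page->freelist holds an array of free-object indices, and page->s_mem points at the first object in the slab:

static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx)
{
	return ((freelist_idx_t *)page->freelist)[idx];
}

static inline void *index_to_obj(struct kmem_cache *cache, struct page *page,
				 unsigned int idx)
{
	/* object address = first object + index * object size */
	return page->s_mem + cache->size * idx;
}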

A final word: studying the Linux kernel may not pay off at work overnight, but it steadily shrinks your confusion, and isn't turning the unknown into the known a joy in itself? Do what you love, and love what you do. Good night, fellow workers.
