Overview
The slab allocator does not hand out whole pages; it allocates at byte granularity. It requests pages from the page-frame allocator (the buddy system) and then manages them itself, carving them up into small objects — many objects sharing one page, like tenants sharing one communal bunk. kmalloc and many of the kernel's dedicated data structures are backed by the slab allocator. The smallest kmalloc size class is 8 bytes, and a request is rounded up to the nearest size class, so asking for less than 8 bytes still costs 8.
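As a quick illustration of the rounding behaviour, here is a minimal sketch (assuming a kernel-module context; the requested sizes are only examples) that uses ksize() to show what kmalloc() actually hands back:
#include <linux/kernel.h>
#include <linux/slab.h>

/* minimal sketch: kmalloc() rounds the request up to the nearest kmalloc
 * cache, and ksize() reports the size of the object actually handed out */
void kmalloc_rounding_demo(void)
{
	void *p = kmalloc(5, GFP_KERNEL);
	void *q = kmalloc(100, GFP_KERNEL);

	if (p)
		pr_info("asked for 5, got %zu bytes\n", ksize(p));   /* typically 8 */
	if (q)
		pr_info("asked for 100, got %zu bytes\n", ksize(q)); /* typically 128 */

	kfree(p);
	kfree(q);
}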
Characteristics
For every size class, the slab allocator maintains three lists of pages: full, partial and free. Allocation prefers the partial list first, then the free list; if the free list is empty too, a page is requested from the buddy system and added as a new free slab. Each size class is a cache, described by struct kmem_cache, and each chunk of pages inside a cache is a slab.
The per-node state of a cache is described by struct kmem_cache_node.
kmem_cache_node exists to support NUMA: there is one instance per NUMA node (not per CPU).
struct kmem_cache_node {
spinlock_t list_lock;
#ifdef CONFIG_SLAB
struct list_head slabs_partial; /* partial list first, better asm code */
struct list_head slabs_full;
struct list_head slabs_free;
unsigned long total_slabs; /* length of all slab lists */
unsigned long free_slabs; /* length of free slab list only */
unsigned long free_objects;
unsigned int free_limit;
unsigned int colour_next; /* Per-node cache coloring */
struct array_cache *shared; /* shared per node */
struct alien_cache **alien; /* on other nodes */
unsigned long next_reap; /* updated without locking */
int free_touched; /* updated without locking */
#endif
};
The kmem_cache structure is the top-level descriptor of one cache:
struct kmem_cache {
struct array_cache __percpu *cpu_cache; /* per-CPU variable, acts as a fast lookaside cache of objects */
/* 1) Cache tunables. Protected by slab_mutex */
unsigned int batchcount;
unsigned int limit;
unsigned int shared;
unsigned int size; /* object size actually allocated (including alignment/debug padding) */
struct reciprocal_value reciprocal_buffer_size;
/* 2) touched by every alloc & free from the backend */
slab_flags_t flags; /* constant flags */
unsigned int num; /* # of objs per slab */
/* 3) cache_grow/shrink */
/* order of pgs per slab (2^n) */
unsigned int gfporder;
/* force GFP flags, e.g. GFP_DMA */
gfp_t allocflags;
size_t colour; /* cache colouring range */
unsigned int colour_off; /* colour offset */
struct kmem_cache *freelist_cache;
unsigned int freelist_size;
/* constructor func */
void (*ctor)(void *obj);
/* 4) cache creation/removal */
const char *name;
struct list_head list;
int refcount;
int object_size;
int align;
....
/*
* If debugging is enabled, then the allocator can add additional
* fields and/or padding to every object. 'size' contains the total
* object size including these internal fields, while 'obj_offset'
* and 'object_size' contain the offset to the user object and its
* size.
*/
int obj_offset;
#endif /* CONFIG_DEBUG_SLAB */
#ifdef CONFIG_MEMCG
struct memcg_cache_params memcg_params;
#endif
#ifdef CONFIG_KASAN
struct kasan_cache kasan_info;
#endif
#ifdef CONFIG_SLAB_FREELIST_RANDOM
unsigned int *random_seq;
#endif
unsigned int useroffset; /* Usercopy region offset */
unsigned int usersize; /* Usercopy region size */
struct kmem_cache_node *node[MAX_NUMNODES]; /* per-NUMA-node state */
};
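Besides the generic kmalloc caches, kernel subsystems create dedicated caches for their own structures through kmem_cache_create(). A minimal sketch (struct foo and the cache name are hypothetical):
#include <linux/slab.h>

/* hypothetical object type that gets its own dedicated cache */
struct foo {
	int id;
	char buf[48];
};

static struct kmem_cache *foo_cachep;

static int foo_cache_init(void)
{
	/* one kmem_cache per object size; objects are carved out of the
	 * full/partial/free slab pages tracked by kmem_cache_node */
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				       0, SLAB_HWCACHE_ALIGN, NULL);
	return foo_cachep ? 0 : -ENOMEM;
}

static void foo_cache_demo(void)
{
	struct foo *obj = kmem_cache_alloc(foo_cachep, GFP_KERNEL);

	if (obj)
		kmem_cache_free(foo_cachep, obj);
}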
Slab initialization
Let's look at kmem_cache_init(), which creates the kmalloc caches.
The caches live in the kmalloc_caches array:
extern struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1];
void __init kmem_cache_init(void)
{
... /* earlier bootstrap steps elided */
/* excerpt: create the kmalloc cache sized for struct kmem_cache_node */
kmalloc_caches[INDEX_NODE] = create_kmalloc_cache(
kmalloc_info[INDEX_NODE].name,
kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS,
0, kmalloc_size(INDEX_NODE));
... /* the remaining kmalloc caches are created later in this function */
}
kmalloc_info is the array describing the kmalloc caches, defined in mm/slab_common.c:
const struct kmalloc_info_struct kmalloc_info[] __initconst = {
{NULL, 0}, {"kmalloc-96", 96},
{"kmalloc-192", 192}, {"kmalloc-8", 8},
{"kmalloc-16", 16}, {"kmalloc-32", 32},
{"kmalloc-64", 64}, {"kmalloc-128", 128},
{"kmalloc-256", 256}, {"kmalloc-512", 512},
{"kmalloc-1024", 1024}, {"kmalloc-2048", 2048},
{"kmalloc-4096", 4096}, {"kmalloc-8192", 8192},
{"kmalloc-16384", 16384}, {"kmalloc-32768", 32768},
{"kmalloc-65536", 65536}, {"kmalloc-131072", 131072},
{"kmalloc-262144", 262144}, {"kmalloc-524288", 524288},
{"kmalloc-1048576", 1048576}, {"kmalloc-2097152", 2097152},
{"kmalloc-4194304", 4194304}, {"kmalloc-8388608", 8388608},
{"kmalloc-16777216", 16777216}, {"kmalloc-33554432", 33554432},
{"kmalloc-67108864", 67108864}
};
Notice that these entries are not sorted by size. Why? The array is indexed by the kmalloc index: indices 1 and 2 are reserved for the two non-power-of-two caches, 96 and 192 bytes (see the size_index table below), and from index 3 upward entry i holds the 2^i-byte cache, so kmalloc-8 sits at index 3, kmalloc-16 at index 4, and so on.
The largest kmalloc cache listed here is 64MB.
The size-to-index lookup table only covers sizes up to 192 bytes:
static u8 size_index[24] __ro_after_init = {
3, /* 8 */
4, /* 16 */
5, /* 24 */
5, /* 32 */
6, /* 40 */
6, /* 48 */
6, /* 56 */
6, /* 64 */
1, /* 72 */
1, /* 80 */
1, /* 88 */
1, /* 96 */
7, /* 104 */
7, /* 112 */
7, /* 120 */
7, /* 128 */
2, /* 136 */
2, /* 144 */
2, /* 152 */
2, /* 160 */
2, /* 168 */
2, /* 176 */
2, /* 184 */
2 /* 192 */
};
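size_index[] is indexed by size_index_elem(); as far as I recall, mm/slab_common.c of this kernel era simply divides (size - 1) by 8 (a sketch from memory, the exact form may differ slightly between versions):
static inline unsigned int size_index_elem(unsigned int bytes)
{
	return (bytes - 1) / 8; /* e.g. 96 -> element 11 -> kmalloc index 1 */
}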
For sizes above 192 bytes, the index is computed with fls() (find last set bit):
static __always_inline int fls(int x)
{
return x ? sizeof(x) * 8 - __builtin_clz(x) : 0;
}
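For example, a 300-byte request yields index = fls(300 - 1) = fls(299) = 9, so it is served from kmalloc-512 (2^9); a 512-byte request also maps to index 9, while 513 bytes gives fls(512) = 10, i.e. kmalloc-1024.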
Allocation path
Step one: kmalloc_slab() looks up the cache matching the requested size, i.e. the kmem_cache structure, and simply returns kmalloc_caches[index]:
struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
{
unsigned int index;
if (size <= 192) {
if (!size)
return ZERO_SIZE_PTR;
index = size_index[size_index_elem(size)];
} else {
if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
WARN_ON(1);
return NULL;
}
index = fls(size - 1);
}
#ifdef CONFIG_ZONE_DMA
if (unlikely((flags & GFP_DMA)))
return kmalloc_dma_caches[index];
#endif
return kmalloc_caches[index];
}
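For context, this is roughly how kmalloc ties the lookup and the actual allocation together in mm/slab.c (a simplified sketch; tracing, KASAN hooks and statistics are omitted):
static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
					  unsigned long caller)
{
	struct kmem_cache *cachep;
	void *ret;

	/* 1. pick the kmem_cache matching the requested size */
	cachep = kmalloc_slab(size, flags);
	if (unlikely(ZERO_OR_NULL_PTR(cachep)))
		return cachep;
	/* 2. take one object from that cache, per-CPU cache first */
	ret = slab_alloc(cachep, flags, caller);

	return ret;
}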
Step two: allocating the object from within the slab.
Go straight to the ____cache_alloc() function:
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *objp;
struct array_cache *ac;
check_irq_off();
ac = cpu_cache_get(cachep); /* get the per-CPU variable cachep->cpu_cache */
if (likely(ac->avail)) { /* objects available in the per-CPU cache */
ac->touched = 1;
objp = ac->entry[--ac->avail]; /* entry[] holds pointers to objects carved from slab pages; pop one off the top */
STATS_INC_ALLOCHIT(cachep);
goto out;
}
STATS_INC_ALLOCMISS(cachep);
objp = cache_alloc_refill(cachep, flags); /* per-CPU cache is empty: refill it from the slab lists */
/*
* the 'ac' may be updated by cache_alloc_refill(),
* and kmemleak_erase() requires its correct value.
*/
ac = cpu_cache_get(cachep); /* re-read cpu_cache, it may have been replaced */
out:
/*
* To avoid a false negative, if an object that is in one of the
* per-CPU caches is leaked, we need to make sure kmemleak doesn't
* treat the array pointers as a reference to the object.
*/
if (objp)
kmemleak_erase(&ac->entry[ac->avail]); /* object obtained: clear the stale entry[] slot so kmemleak does not see it as a reference */
return objp;
}
____cache_alloc proceeds in three steps:
1. Try to take an object from kmem_cache->cpu_cache. The cpu_cache->entry[] array holds pointers to objects carved out of slab pages and is consumed from the top down, like a stack.
The per-CPU fast path is the array_cache structure. It does not hold a whole slab's worth of objects: a refill moves at most batchcount objects, and when the cache has not been touched recently the refill is further capped at BATCHREFILL_LIMIT, i.e. 16 (a minimal sketch of this stack behaviour follows the numbered steps below).
struct array_cache {
unsigned int avail; /* number of object pointers currently in entry[] */
unsigned int limit; /* capacity of entry[] */
unsigned int batchcount; /* number of objects moved per refill/flush */
unsigned int touched; /* set when the cache was used recently */
void *entry[]; /*
* Must have this definition in here for the proper
* alignment of array_cache. Also simplifies accessing
* the entries.
*/
};
2. If no object is available in the per-CPU cache, refill cpu_cache from the slab lists and update the slab state.
3. Clear the corresponding cpu_cache entry (kmemleak_erase) so the stale array slot is not treated as a reference.
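As a minimal sketch (not actual kernel code) of the stack discipline mentioned in step 1: allocation pops from the top of entry[], freeing pushes back onto it, so the most recently freed, cache-hot objects are reused first.
/* illustration only of how the allocation and free paths treat entry[] */
static void *ac_pop(struct array_cache *ac)
{
	return ac->avail ? ac->entry[--ac->avail] : NULL; /* allocation path */
}

static void ac_push(struct array_cache *ac, void *objp)
{
	if (ac->avail < ac->limit)
		ac->entry[ac->avail++] = objp; /* free path */
}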
Now focus on cache_alloc_refill(), which fills the cpu_cache entries:
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
int batchcount;
struct kmem_cache_node *n;
struct array_cache *ac, *shared;
int node;
void *list = NULL;
struct page *page;
check_irq_off();
node = numa_mem_id();
ac = cpu_cache_get(cachep);
batchcount = ac->batchcount;
if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
/*
* If there was little recent activity on this cache, then
* perform only a partial refill. Otherwise we could generate
* refill bouncing.
*/
batchcount = BATCHREFILL_LIMIT; /* cap at 16 */
}
n = get_node(cachep, node);
BUG_ON(ac->avail > 0 || !n);
shared = READ_ONCE(n->shared);
if (!n->free_objects && (!shared || !shared->avail))
goto direct_grow;
spin_lock(&n->list_lock);
shared = READ_ONCE(n->shared);
/* See if we can refill from the shared array */
if (shared && transfer_objects(ac, shared, batchcount)) {
shared->touched = 1;
goto alloc_done;
}
while (batchcount > 0) {
/* Get slab alloc is to come from. */
page = get_first_slab(n, false); /* get a slab page, partial list first */
if (!page)
goto must_grow;
check_spinlock_acquired(cachep);
/* take objects from the page and fill cpu_cache->entry[] */
batchcount = alloc_block(cachep, ac, page, batchcount);
fixup_slab_list(cachep, n, page, &list); /* fix up the lists: if the page is now full, move it from partial to full */
}
must_grow:
n->free_objects -= ac->avail;
alloc_done:
spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
direct_grow:
if (unlikely(!ac->avail)) {
/* Check if we can use obj in pfmemalloc slab */
if (sk_memalloc_socks()) {
void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
if (obj)
return obj;
}
page = cache_grow_begin(cachep, gfp_exact_node(flags), node);
/*
* cache_grow_begin() can reenable interrupts,
* then ac could change.
*/
ac = cpu_cache_get(cachep);
if (!ac->avail && page)
alloc_block(cachep, ac, page, batchcount);
cache_grow_end(cachep, page);
if (!ac->avail)
return NULL;
}
ac->touched = 1;
return ac->entry[--ac->avail];
}
get_first_slab() picks the first available slab page, preferring the partial list and falling back to the free list:
static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
{
struct page *page;
assert_spin_locked(&n->list_lock);
page = list_first_entry_or_null(&n->slabs_partial, struct page, lru);
if (!page) {
n->free_touched = 1;
page = list_first_entry_or_null(&n->slabs_free, struct page,
lru);
if (page)
n->free_slabs--;
}
if (sk_memalloc_socks())
page = get_valid_first_slab(n, page, pfmemalloc);
return page;
}
alloc_block() fills the cpu_cache->entry[] array:
static __always_inline int alloc_block(struct kmem_cache *cachep,
struct array_cache *ac, struct page *page, int batchcount)
{
/*
* There must be at least one object available for
* allocation.
*/
BUG_ON(page->active >= cachep->num);
while (page->active < cachep->num && batchcount--) {
STATS_INC_ALLOCED(cachep);
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
ac->entry[ac->avail++] = slab_get_obj(cachep, page);
}
return batchcount;
}
fixup_slab_list() moves the page onto the correct slab list:
static inline void fixup_slab_list(struct kmem_cache *cachep,
struct kmem_cache_node *n, struct page *page,
void **list)
{
/* move slabp to correct slabp list: */
list_del(&page->lru);
if (page->active == cachep->num) {
/* if every object of this page is now in use, move it to the full list */
list_add(&page->lru, &n->slabs_full);
if (OBJFREELIST_SLAB(cachep)) {
#if DEBUG
/* Poisoning will be done without holding the lock */
if (cachep->flags & SLAB_POISON) {
void **objp = page->freelist;
*objp = *list;
*list = objp;
}
#endif
page->freelist = NULL;
}
} else
list_add(&page->lru, &n->slabs_partial); /* otherwise put it back on the partial list */
}
page->active is used only by the slab allocator: every object handed out from a page increments active. When active equals cachep->num, the number of objects a slab holds (roughly (PAGE_SIZE << gfporder) / size), the page is fully allocated and goes onto the full list.
slab_get_obj() is what performs page->active++. An object moved into cpu_cache already counts as allocated, so an allocation does not pull a single object from the slab at a time; it grabs a batch of objects, pushes them into the per-CPU cache, and later allocations are served straight from cpu_cache.
static void *slab_get_obj(struct kmem_cache *cachep, struct page *page)
{
void *objp;
objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
page->active++;
#if DEBUG
if (cachep->flags & SLAB_STORE_USER)
set_store_user_dirty(cachep);
#endif
return objp;
}
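slab_get_obj() relies on two small helpers: get_free_obj() reads the index of the next free object from the page's freelist array, and index_to_obj() turns that index into an address inside the slab. From memory of the same kernel era they look roughly like this (field names may differ slightly between versions):
/* sketch from memory of mm/slab.c */
static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx)
{
	/* page->freelist is an array of free object indices */
	return ((freelist_idx_t *)page->freelist)[idx];
}

static inline void *index_to_obj(struct kmem_cache *cache, struct page *page,
				 unsigned int idx)
{
	/* objects are laid out back to back starting at page->s_mem */
	return page->s_mem + cache->size * idx;
}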
** A final word: studying Linux may not pay off at work overnight, but it will leave you with fewer doubts. Isn't turning the unknown into the known a joy in itself? Do what you love, and love what you do. Good night, fellow workers. **