kmem_cache_boot, in turn, is defined as:
/* internal cache of cache description objs */
static struct kmem_cache kmem_cache_boot = {
.batchcount = 1,
.limit = BOOT_CPUCACHE_ENTRIES, /* defaults to 1 */
.shared = 1,
.size = sizeof(struct kmem_cache),
.name = "kmem_cache",
};
The comment in the source already explains this clearly.
setup_nodelists_pointer simply points the nodelists member of struct kmem_cache at the memory right after the per-CPU array pointers, so the node lists can be reached through an ordinary pointer. On a uniform memory access (UMA) system there is only one node.
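For reference, here is setup_nodelists_pointer roughly as it appears in the 3.x mm/slab.c that this walkthrough follows; struct kmem_cache in that version ends with the array[] of per-CPU pointers, so the node lists are parked right behind them:
static void setup_nodelists_pointer(struct kmem_cache *cachep)
{
	/* nodelists points at the memory just past the nr_cpu_ids array_cache pointers */
	cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
}
The bootstrap node lists themselves come from initkmem_list3: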
static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
This is a static, __initdata global variable in slab.c; in the source it is introduced by the following comment:
/*
* Need this for bootstrapping a per node allocator.
*/
kmem_list3_init initializes a node's three slab lists: slabs_full, slabs_partial and slabs_free. Why these have to be set up follows from how a cache is organized (the original post shows a diagram of the cache/slab layout at this point).
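A rough copy of kmem_list3_init from the same 3.x mm/slab.c; it simply resets the three lists plus the per-node bookkeeping fields:
static void kmem_list3_init(struct kmem_list3 *parent)
{
	INIT_LIST_HEAD(&parent->slabs_full);
	INIT_LIST_HEAD(&parent->slabs_partial);
	INIT_LIST_HEAD(&parent->slabs_free);
	parent->shared = NULL;
	parent->alien = NULL;
	parent->colour_next = 0;
	spin_lock_init(&parent->list_lock);
	parent->free_objects = 0;
	parent->free_touched = 0;
}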
Here CACHE_CACHE is defined as 0 near the top of the file, together with the other bootstrap list indexes.
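These index macros sit next to initkmem_list3 in slab.c (values as in the 3.x source; they may differ slightly between versions):
#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
#define CACHE_CACHE 0 /* bootstrap lists for kmem_cache itself */
#define SIZE_AC MAX_NUMNODES /* lists for the arraycache_init kmalloc cache */
#define SIZE_L3 (2 * MAX_NUMNODES) /* lists for the kmem_list3 kmalloc cache */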
/*
* For setting up all the kmem_list3s for cache whose buffer_size is same as
* size of kmem_list3.
*/
static void __init set_up_list3s(struct kmem_cache *cachep, int index)
{
int node;
for_each_online_node(node) {
cachep->nodelists[node] = &initkmem_list3[index + node];
cachep->nodelists[node]->next_reap = jiffies +
REAPTIMEOUT_LIST3 +
((unsigned long)cachep) % REAPTIMEOUT_LIST3;
}
}
Next the real work of creating caches begins; the kernel's own comment spells out the bootstrap steps:
/* Bootstrap is tricky, because several objects are allocated
* from caches that do not exist yet:
* 1) initialize the kmem_cache cache: it contains the struct
* kmem_cache structures of all caches, except kmem_cache itself:
* kmem_cache is statically allocated.
* Initially an __init data area is used for the head array and the
* kmem_list3 structures, it's replaced with a kmalloc allocated
* array at the end of the bootstrap.
* 2) Create the first kmalloc cache.
* The struct kmem_cache for the new cache is allocated normally.
* An __init data area is used for the head array.
* 3) Create the remaining kmalloc caches, with minimally sized
* head arrays.
* 4) Replace the __init data head arrays for kmem_cache and the first
* kmalloc cache with kmalloc allocated arrays.
* 5) Replace the __init data for kmem_list3 for kmem_cache and
* the other cache's with kmalloc allocated memory.
* 6) Resize the head arrays of the kmalloc caches to their final sizes.
*/
/* 1) create the kmem_cache */
/*
* struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
*/
create_boot_cache(kmem_cache, "kmem_cache",
offsetof(struct kmem_cache, array[nr_cpu_ids]) +
nr_node_ids * sizeof(struct kmem_list3 *),
SLAB_HWCACHE_ALIGN);
list_add(&kmem_cache->list, &slab_caches);
First the very first cache, named "kmem_cache", is created; the kmem_cache pointer variable has already been pointed at the statically allocated kmem_cache_boot.
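The opening of kmem_cache_init (a slightly abridged sketch of the 3.x version) shows how kmem_cache, the bootstrap lists and the CACHE_CACHE slots are wired up before create_boot_cache runs:
void __init kmem_cache_init(void)
{
	int i;

	/* the first cache descriptor is the statically allocated kmem_cache_boot */
	kmem_cache = &kmem_cache_boot;
	setup_nodelists_pointer(kmem_cache);

	/* initialize the bootstrap kmem_list3s and point kmem_cache at its slots */
	for (i = 0; i < NUM_INIT_LISTS; i++)
		kmem_list3_init(&initkmem_list3[i]);
	set_up_list3s(kmem_cache, CACHE_CACHE);

	/* ... then step 1) above: create_boot_cache(kmem_cache, ...) ... */
}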
Now let's look at the create_boot_cache function:
#ifndef CONFIG_SLOB
/* Create a cache during boot when no slab services are available yet */
void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
unsigned long flags)
{
int err;
s->name = name;
s->size = s->object_size = size;
s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
err = __kmem_cache_create(s, flags);
if (err)
panic("Creation of kmalloc slab %s size=%zd failed. Reason %d\n",
name, size, err);
s->refcount = -1; /* Exempt from merging for now */
}
struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
unsigned long flags)
{
struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
if (!s)
panic("Out of memory when creating slab %s\n", name);
create_boot_cache(s, name, size, flags);
list_add(&s->list, &slab_caches);
s->refcount = 1;
return s;
}
#endif /* !CONFIG_SLOB */
It in turn calls __kmem_cache_create, the most important function here:
/**
* __kmem_cache_create - Create a cache.
* @cachep: cache management descriptor
* @flags: SLAB flags
*
* Returns a ptr to the cache on success, NULL on failure.
* Cannot be called within a int, but can be interrupted.
* The @ctor is run when new pages are allocated by the cache.
*
* The flags are
*
* %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
* to catch references to uninitialised memory.
*
* %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
* for buffer overruns.
*
* %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
* cacheline. This can be beneficial if you're counting cycles as closely
* as davem.
*/
int
__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
size_t left_over, slab_size, ralign;
gfp_t gfp;
int err;
size_t size = cachep->size;
#if DEBUG
#if FORCED_DEBUG
/*
* Enable redzoning and last user accounting, except for caches with
* large objects, if the increased size would increase the object size
* above the next power of two: caches with object sizes just above a
* power of two have a significant amount of internal fragmentation.
*/
if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
2 * sizeof(unsigned long long)))
flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
if (!(flags & SLAB_DESTROY_BY_RCU))
flags |= SLAB_POISON;
#endif
if (flags & SLAB_DESTROY_BY_RCU)
BUG_ON(flags & SLAB_POISON);
#endif
/*
* Check that size is in terms of words. This is needed to avoid
* unaligned accesses for some archs when redzoning is used, and makes
* sure any on-slab bufctl's are also correctly aligned.
*/
if (size & (BYTES_PER_WORD - 1)) {
size += (BYTES_PER_WORD - 1);
size &= ~(BYTES_PER_WORD - 1);
} // round size up to BYTES_PER_WORD (word) alignment
/*
* Redzoning and user store require word alignment or possibly larger.
* Note this will be overridden by architecture or caller mandated
* alignment if either is greater than BYTES_PER_WORD.
*/
if (flags & SLAB_STORE_USER)
ralign = BYTES_PER_WORD;
if (flags & SLAB_RED_ZONE) {
ralign = REDZONE_ALIGN;
/* If redzoning, ensure that the second redzone is suitably
* aligned, by adjusting the object size accordingly. */
size += REDZONE_ALIGN - 1;
size &= ~(REDZONE_ALIGN - 1);
}
/* 3) caller mandated alignment */
if (ralign < cachep->align) {
ralign = cachep->align;
}
/* disable debug if necessary */
if (ralign > __alignof__(unsigned long long))
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
/*
* 4) Store it.
*/
cachep->align = ralign;
if (slab_is_available()) // the excerpt below is inserted here because slab_is_available() just checks slab_state, which nothing has set yet, so it is still DOWN at this point
/*
* State of the slab allocator.
*
* This is used to describe the states of the allocator during bootup.
* Allocators use this to gradually bootstrap themselves. Most allocators
* have the problem that the structures used for managing slab caches are
* allocated from slab caches themselves.
*/
enum slab_state {
DOWN, /* No slab functionality yet */
PARTIAL, /* SLUB: kmem_cache_node available */
PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */
PARTIAL_L3, /* SLAB: kmalloc size for l3 struct available */
UP, /* Slab caches usable but not all extras yet */
FULL /* Everything is working */
};
gfp = GFP_KERNEL;
else
gfp = GFP_NOWAIT;
#define GFP_NOWAIT (GFP_ATOMIC & ~__GFP_HIGH)
setup_nodelists_pointer(cachep);
#if DEBUG
/*
* Both debugging options require word-alignment which is calculated
* into align above.
*/
if (flags & SLAB_RED_ZONE) {
/* add space for red zone words */
cachep->obj_offset += sizeof(unsigned long long);
size += 2 * sizeof(unsigned long long);
}
if (flags & SLAB_STORE_USER) {
/* user store requires one word storage behind the end of
* the real object. But if the second red zone needs to be
* aligned to 64 bits, we must allow that much space.
*/
if (flags & SLAB_RED_ZONE)
size += REDZONE_ALIGN;
else
size += BYTES_PER_WORD;
}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
&& cachep->object_size > cache_line_size()
&& ALIGN(size, cachep->align) < PAGE_SIZE) {
cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
size = PAGE_SIZE;
}
#endif
#endif
/*
* Determine if the slab management is 'on' or 'off' slab.
* (bootstrapping cannot cope with offslab caches so don't do // decide whether the slab management data lives on the slab's own pages;
* it too early on. Always use on-slab management when // off-slab requires size >= PAGE_SIZE/8 (512 or 1024 for 4K/8K pages) and slab_early_init == 0;
* SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) // slab_early_init stays 1 while kmem_cache itself is created and is cleared only before the general caches are built; the first call passes just SLAB_HWCACHE_ALIGN
*/
if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
!(flags & SLAB_NOLEAKTRACE))
/*
* Size is large, assume best to place the slab management obj
* off-slab (should allow better packing of objs).
*/
flags |= CFLGS_OFF_SLAB;
size = ALIGN(size, cachep->align);
left_over = calculate_slab_order(cachep, size, cachep->align, flags); // from the object size, work out the page order of a slab (how many pages it spans),
if (!cachep->num) // how many objects it holds, and how much space is left over once the management data is deducted -- quite straightforward
return -E2BIG;
slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
+ sizeof(struct slab), cachep->align);
/*
* If the slab has been placed off-slab, and we have enough space then
* move it on-slab. This is at the expense of any extra colouring.
*/
if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
flags &= ~CFLGS_OFF_SLAB;
left_over -= slab_size;
}
if (flags & CFLGS_OFF_SLAB) {
/* really off slab. No need for manual alignment */
slab_size =
cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
#ifdef CONFIG_PAGE_POISONING
/* If we're going to use the generic kernel_map_pages()
* poisoning, then it's going to smash the contents of
* the redzone and userword anyhow, so switch them off.
*/
if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
}
cachep->colour_off = cache_line_size(); // L1 cache line size, 32 bytes on the platform discussed here
/* Offset must be a multiple of the alignment. */
if (cachep->colour_off < cachep->align)
cachep->colour_off = cachep->align;
cachep->colour = left_over / cachep->colour_off; // slab colouring setup: how many colour offsets fit into the leftover space
cachep->slab_size = slab_size;
cachep->flags = flags;
cachep->allocflags = 0;
if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
cachep->allocflags |= GFP_DMA;
cachep->size = size;
cachep->reciprocal_buffer_size = reciprocal_value(size);
if (flags & CFLGS_OFF_SLAB) {
cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
/*
* This is a possibility for one of the malloc_sizes caches.
* But since we go off slab only for object size greater than
* PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
* this should not happen at all.
* But leave a BUG_ON for some lucky dude.
*/
BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
}
err = setup_cpu_cache(cachep, gfp);
if (err) {
__kmem_cache_shutdown(cachep);
return err;
}
if (flags & SLAB_DEBUG_OBJECTS) {
/*
* Would deadlock through slab_destroy()->call_rcu()->
* debug_object_activate()->kmem_cache_alloc().
*/
WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
slab_set_debugobj_lock_classes(cachep);
} else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU))
on_slab_lock_classes(cachep);
return 0;
}
Inside it there is an interesting and crucial function, calculate_slab_order; it reveals exactly how a slab lays out and manages its objects.
/**
* calculate_slab_order - calculate size (page order) of slabs
* @cachep: pointer to the cache that is being created
* @size: size of objects to be created in this cache.
* @align: required alignment for the objects.
* @flags: slab allocation flags
*
* Also calculates the number of objects per slab.
*
* This could be made much more intelligent. For now, try to avoid using
* high order pages for slabs. When the gfp() functions are more friendly
* towards high-order requests, this should be changed.
*/
static size_t calculate_slab_order(struct kmem_cache *cachep,
size_t size, size_t align, unsigned long flags)
{
unsigned long offslab_limit;
size_t left_over = 0;
int gfporder;
for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
unsigned int num;
size_t remainder;
cache_estimate(gfporder, size, align, flags, &remainder, &num); // depending on off-slab vs on-slab, work out how many objects fit into 2^gfporder pages once the management data is accounted for, and what is left over -- worth a careful read
if (!num) // a slab must be able to hold at least one object
continue;
if (flags & CFLGS_OFF_SLAB) {
/*
* Max number of objs-per-slab for caches which
* use off-slab slabs. Needed to avoid a possible
* looping condition in cache_grow().
*/
offslab_limit = size - sizeof(struct slab);
offslab_limit /= sizeof(kmem_bufctl_t);
if (num > offslab_limit)
break;
}
/* Found something acceptable - save it away */
cachep->num = num;
cachep->gfporder = gfporder;
left_over = remainder;
/*
* A VFS-reclaimable slab tends to have most allocations
* as GFP_NOFS and we really don't want to have to be allocating
* higher-order pages when we are unable to shrink dcache.
*/
if (flags & SLAB_RECLAIM_ACCOUNT)
break;
/*
* Large number of objects is good, but very large slabs are
* currently bad for the gfp()s.
*/
if (gfporder >= slab_max_order)
break;
/*
* Acceptable internal fragmentation?
*/
if (left_over * 8 <= (PAGE_SIZE << gfporder))
break;
}
return left_over;
}
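The heavy lifting is done by cache_estimate, which is where the on-slab versus off-slab layout becomes concrete. Below is a trimmed version based on the 3.x mm/slab.c (the SLAB_LIMIT capping is omitted for brevity):
/* size of the on-slab management area: struct slab plus one kmem_bufctl_t per object */
static size_t slab_mgmt_size(size_t nr_objs, size_t align)
{
	return ALIGN(sizeof(struct slab) + nr_objs * sizeof(kmem_bufctl_t), align);
}

static void cache_estimate(unsigned long gfporder, size_t buffer_size,
			size_t align, int flags, size_t *left_over,
			unsigned int *num)
{
	int nr_objs;
	size_t mgmt_size;
	size_t slab_size = PAGE_SIZE << gfporder;

	if (flags & CFLGS_OFF_SLAB) {
		/* management data lives elsewhere: the pages hold only objects */
		mgmt_size = 0;
		nr_objs = slab_size / buffer_size;
	} else {
		/* first guess ignores the alignment padding of the management area */
		nr_objs = (slab_size - sizeof(struct slab)) /
			(buffer_size + sizeof(kmem_bufctl_t));

		/* the guess can be one too high once the padding is counted */
		if (slab_mgmt_size(nr_objs, align) + nr_objs * buffer_size > slab_size)
			nr_objs--;

		mgmt_size = slab_mgmt_size(nr_objs, align);
	}
	*num = nr_objs;
	*left_over = slab_size - nr_objs * buffer_size - mgmt_size;
}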
After all the initialization and sizing above, a final call to setup_cpu_cache finishes creating the cache; a simplified sketch of that function follows, and then the bootstrap proceeds with steps 2 and 3.
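A heavily simplified sketch of what setup_cpu_cache does during bootstrap (based on the 3.x SLAB code; the exact branches and the names of the static __initdata arrays differ between versions, so treat this as an outline rather than the verbatim function). The point to notice is that every bootstrap cache starts with limit = BOOT_CPUCACHE_ENTRIES and batchcount = 1; the tuned values only arrive later via enable_cpucache:
static int setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
	if (slab_state >= FULL) /* normal path once bootstrap is done */
		return enable_cpucache(cachep, gfp);

	if (slab_state == DOWN) {
		/* creating kmem_cache itself: use a static bootstrap array_cache */
		cachep->array[smp_processor_id()] = &initarray_generic.cache;
		slab_state = PARTIAL;
	} else if (slab_state == PARTIAL) {
		/* creating the arraycache_init kmalloc cache */
		cachep->array[smp_processor_id()] = &initarray_generic.cache;
		set_up_list3s(cachep, SIZE_AC);
		slab_state = (INDEX_AC == INDEX_L3) ? PARTIAL_L3 : PARTIAL_ARRAYCACHE;
	} else {
		/* later boot caches: kmalloc for the array cache already works */
		cachep->array[smp_processor_id()] =
			kmalloc(sizeof(struct arraycache_init), gfp);

		if (slab_state == PARTIAL_ARRAYCACHE) {
			set_up_list3s(cachep, SIZE_L3);
			slab_state = PARTIAL_L3;
		} else {
			int node;
			for_each_online_node(node) {
				cachep->nodelists[node] = kmalloc_node(
					sizeof(struct kmem_list3), gfp, node);
				kmem_list3_init(cachep->nodelists[node]);
			}
		}
	}

	/* the bootstrap per-CPU array is deliberately tiny */
	cpu_cache_get(cachep)->avail = 0;
	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
	cpu_cache_get(cachep)->batchcount = 1;
	cpu_cache_get(cachep)->touched = 0;
	cachep->batchcount = 1;
	cachep->limit = BOOT_CPUCACHE_ENTRIES;
	return 0;
}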
/* 2+3) create the kmalloc caches */
sizes = malloc_sizes;
names = cache_names;
/*
* Initialize the caches that provide memory for the array cache and the
* kmem_list3 structures first. Without this, further allocations will
* bug.
*/
sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name, // create the cache whose object size is sizeof(struct arraycache_init)
sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS);
if (INDEX_AC != INDEX_L3)
sizes[INDEX_L3].cs_cachep =
create_kmalloc_cache(names[INDEX_L3].name, // create the cache whose object size is sizeof(struct kmem_list3)
sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS);
slab_early_init = 0;
while (sizes->cs_size != ULONG_MAX) { // create the general-purpose caches, driven by malloc_sizes and cache_names
/*
* For performance, all the general caches are L1 aligned.
* This should be particularly beneficial on SMP boxes, as it
* eliminates "false sharing".
* Note for systems short on memory removing the alignment will
* allow tighter packing of the smaller caches.
*/
if (!sizes->cs_cachep)
sizes->cs_cachep = create_kmalloc_cache(names->name,
sizes->cs_size, ARCH_KMALLOC_FLAGS);
#ifdef CONFIG_ZONE_DMA
sizes->cs_dmacachep = create_kmalloc_cache(
names->name_dma, sizes->cs_size,
SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS);
#endif
sizes++;
names++;
}
A few words here about cache_names and malloc_sizes:
/*
* These are the default caches for kmalloc. Custom caches can have other sizes.
*/
struct cache_sizes malloc_sizes[] = {
#define CACHE(x) { .cs_size = (x) },
#include <linux/kmalloc_sizes.h>
CACHE(ULONG_MAX)
#undef CACHE
};
We won't expand on that header here.
/* Must match cache_sizes above. Out of line to keep cache footprint low. */
struct cache_names {
char *name;
char *name_dma;
};
static struct cache_names __initdata cache_names[] = {
#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
#include <linux/kmalloc_sizes.h>
{NULL,}
#undef CACHE
};
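Purely for illustration, after the preprocessor pulls in <linux/kmalloc_sizes.h> the two tables end up looking roughly like this (the actual set of sizes depends on PAGE_SIZE and L1_CACHE_BYTES, so the entries below are only examples):
struct cache_sizes malloc_sizes[] = {
	{ .cs_size = 32 },	/* from CACHE(32) */
	{ .cs_size = 64 },	/* from CACHE(64) */
	{ .cs_size = 128 },
	/* ... up to the largest supported size ... */
	{ .cs_size = ULONG_MAX }	/* terminator tested by the while loop above */
};

static struct cache_names __initdata cache_names[] = {
	{ .name = "size-32", .name_dma = "size-32(DMA)" },
	{ .name = "size-64", .name_dma = "size-64(DMA)" },
	{ .name = "size-128", .name_dma = "size-128(DMA)" },
	/* ... */
	{NULL,}
};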
create_kmalloc_cache is essentially a wrapper around create_boot_cache; the loop above runs through all of the kernel's predefined general-purpose cache sizes and creates each one. Then we move on to steps 4 and 5:
/* 4) Replace the bootstrap head arrays */
{
struct array_cache *ptr;
ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
memcpy(ptr, cpu_cache_get(kmem_cache),
sizeof(struct arraycache_init));
/*
* Do not assume that spinlocks can be initialized via memcpy:
*/
spin_lock_init(&ptr->lock);
kmem_cache->array[smp_processor_id()] = ptr;
ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
!= &initarray_generic.cache);
memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
sizeof(struct arraycache_init));
/*
* Do not assume that spinlocks can be initialized via memcpy:
*/
spin_lock_init(&ptr->lock);
malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
ptr;
}
/* 5) Replace the bootstrap kmem_list3's */
{
int nid;
for_each_online_node(nid) {
init_list(kmem_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
init_list(malloc_sizes[INDEX_AC].cs_cachep,
&initkmem_list3[SIZE_AC + nid], nid);
if (INDEX_AC != INDEX_L3) {
init_list(malloc_sizes[INDEX_L3].cs_cachep,
&initkmem_list3[SIZE_L3 + nid], nid);
}
}
}
slab_state = UP;
Finally slab_state is set to UP, meaning the slab allocator can now be used normally. Most of the above is raw code; the actual allocation path was already covered in the earlier kmalloc discussion. The goal here is simply to understand what a cache really is and how it gets initialized.
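The init_list helper used in step 5 does the actual replacement: it kmallocs a fresh kmem_list3 on the right node, copies the bootstrap one into it and re-points cachep->nodelists. Roughly, from the same 3.x mm/slab.c:
static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
			int nodeid)
{
	struct kmem_list3 *ptr;

	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid);
	BUG_ON(!ptr);

	memcpy(ptr, list, sizeof(struct kmem_list3));
	/*
	 * Do not assume that spinlocks can be initialized via memcpy:
	 */
	spin_lock_init(&ptr->list_lock);

	MAKE_ALL_LISTS(cachep, ptr, nodeid); /* fix up the list_head pointers after the memcpy */
	cachep->nodelists[nodeid] = ptr;
}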
After kmem_cache_init there is one more function, kmem_cache_init_late.
It mainly calls enable_cpucache for every cache on the slab_caches list and registers a CPU notifier chain.
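The enable_cpucache part corresponds to step 6 of the bootstrap comment ("resize the head arrays to their final sizes"). A sketch of that loop, based on the 3.x mm/slab.c (lockdep annotation and error handling abridged); right after this loop the function registers the CPU notifier, which is the snippet shown next:
	struct kmem_cache *cachep;

	/* 6) resize the head arrays to their final sizes */
	mutex_lock(&slab_mutex);
	list_for_each_entry(cachep, &slab_caches, list)
		if (enable_cpucache(cachep, GFP_NOWAIT))
			BUG();
	mutex_unlock(&slab_mutex);

	/* Done! */
	slab_state = FULL;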
/*
* Register a cpu startup notifier callback that initializes
* cpu_cache_get for all new cpus
*/
register_cpu_notifier(&cpucache_notifier);
Remember the contradiction we noticed earlier when analyzing batchcount? enable_cpucache is where that finally gets sorted out:
/* Called with slab_mutex held always */
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
int err;
int limit = 0;
int shared = 0;
int batchcount = 0;
if (!is_root_cache(cachep)) {
struct kmem_cache *root = memcg_root_cache(cachep);
limit = root->limit;
shared = root->shared;
batchcount = root->batchcount;
}
if (limit && shared && batchcount)
goto skip_setup;
/*
* The head array serves three purposes:
* - create a LIFO ordering, i.e. return objects that are cache-warm
* - reduce the number of spinlock operations.
* - reduce the number of linked list operations on the slab and
* bufctl chains: array operations are cheaper.
* The numbers are guessed, we should auto-tune as described by
* Bonwick.
*/
if (cachep->size > 131072)
limit = 1;
else if (cachep->size > PAGE_SIZE)
limit = 8;
else if (cachep->size > 1024)
limit = 24;
else if (cachep->size > 256)
limit = 54;
else
limit = 120;
/*
* CPU bound tasks (e.g. network routing) can exhibit cpu bound
* allocation behaviour: Most allocs on one cpu, most free operations
* on another cpu. For these cases, an efficient object passing between
* cpus is necessary. This is provided by a shared array. The array
* replaces Bonwick's magazine layer.
* On uniprocessor, it's functionally equivalent (but less efficient)
* to a larger limit. Thus disabled by default.
*/
shared = 0;
if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
shared = 8;
#if DEBUG
/*
* With debugging enabled, large batchcount lead to excessively long
* periods with disabled local interrupts. Limit the batchcount
*/
if (limit > 32)
limit = 32;
#endif
batchcount = (limit + 1) / 2;
skip_setup:
err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
if (err)
printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
cachep->name, -err);
return err;
}
It computes limit from the object size and then derives batchcount from limit. For example, a 192-byte object falls through to the final else branch, so limit = 120 and batchcount = (120 + 1) / 2 = 60.
This is only a small beginning; memory management is a vast and deep subject, and understanding really only deepens by analyzing concrete problems as they come up.