http://blog.chinaunix.net/uid-20786208-id-4785655.html
First, cache_alloc_refill(), the slow path taken when a cache's per-CPU array is empty; it refills the array and hands back one object:
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
                                bool force_refill)
{
    int batchcount;
    struct kmem_list3 *l3;
    struct array_cache *ac;
    int node;

    check_irq_off();
    node = numa_mem_id();
    if (unlikely(force_refill))
        goto force_grow;
retry:
    ac = cpu_cache_get(cachep);
    batchcount = ac->batchcount;
    if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
        /*
         * If there was little recent activity on this cache, then
         * perform only a partial refill.  Otherwise we could generate
         * refill bouncing.
         */
        batchcount = BATCHREFILL_LIMIT;
    }
    l3 = cachep->nodelists[node];

    BUG_ON(ac->avail > 0 || !l3);
    spin_lock(&l3->list_lock);

    /* See if we can refill from the shared array */
    if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
        l3->shared->touched = 1;
        goto alloc_done;
    }

    while (batchcount > 0) {
        struct list_head *entry;
        struct slab *slabp;
        /* Get slab alloc is to come from. */
        entry = l3->slabs_partial.next;
        if (entry == &l3->slabs_partial) {
            l3->free_touched = 1;
            entry = l3->slabs_free.next;
            if (entry == &l3->slabs_free)
                goto must_grow;
        }

        slabp = list_entry(entry, struct slab, list);
        check_slabp(cachep, slabp);
        check_spinlock_acquired(cachep);

        /*
         * The slab was either on partial or free list so
         * there must be at least one object available for
         * allocation.
         */
        BUG_ON(slabp->inuse >= cachep->num);

        while (slabp->inuse < cachep->num && batchcount--) {
            STATS_INC_ALLOCED(cachep);
            STATS_INC_ACTIVE(cachep);
            STATS_SET_HIGH(cachep);

            ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
                                                node));
        }
        check_slabp(cachep, slabp);

        /* move slabp to correct slabp list: */
        list_del(&slabp->list);
        if (slabp->free == BUFCTL_END)
            list_add(&slabp->list, &l3->slabs_full);
        else
            list_add(&slabp->list, &l3->slabs_partial);
    }

must_grow:
    l3->free_objects -= ac->avail;
alloc_done:
    spin_unlock(&l3->list_lock);

    if (unlikely(!ac->avail)) {
        int x;
force_grow:
        x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);

        /* cache_grow can reenable interrupts, then ac could change. */
        ac = cpu_cache_get(cachep);
        node = numa_mem_id();

        /* no objects in sight? abort */
        if (!x && (ac->avail == 0 || force_refill))
            return NULL;

        if (!ac->avail)     /* objects refilled by interrupt? */
            goto retry;
    }
    ac->touched = 1;

    return ac_get_obj(cachep, ac, flags, force_refill);
}
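Note the order of preference in the refill path: the node's shared array first (transfer_objects()), then the partial slabs, then the free slabs, and only as a last resort cache_grow(). As a rough illustration, here is a minimal user-space model of the shared-array transfer step; struct model_ac and all the numbers are invented for the demo and are much simpler than the kernel's real array_cache:

#include <stdio.h>
#include <string.h>

/* hypothetical, simplified stand-in for struct array_cache */
struct model_ac {
    unsigned int avail;      /* objects currently in the array */
    unsigned int limit;      /* capacity */
    void *entry[64];         /* in the kernel this is a flexible array behind the struct */
};

/* mirrors the logic of transfer_objects(): move up to 'max' object
 * pointers from the shared array to the per-CPU array */
static int model_transfer(struct model_ac *to, struct model_ac *from,
                          unsigned int max)
{
    unsigned int nr = max;

    if (nr > from->avail)
        nr = from->avail;
    if (nr > to->limit - to->avail)
        nr = to->limit - to->avail;
    if (!nr)
        return 0;

    /* take from the top of 'from', append on top of 'to' */
    memcpy(to->entry + to->avail, from->entry + from->avail - nr,
           nr * sizeof(void *));
    from->avail -= nr;
    to->avail += nr;
    return nr;
}

int main(void)
{
    struct model_ac cpu = { .avail = 0, .limit = 16 };
    struct model_ac shared = { .avail = 0, .limit = 64 };

    for (int i = 0; i < 20; i++)    /* pretend 20 free objects sit in the shared array */
        shared.entry[shared.avail++] = (void *)(long)(i + 1);

    int moved = model_transfer(&cpu, &shared, 8 /* batchcount */);
    printf("moved=%d cpu.avail=%u shared.avail=%u\n",
           moved, cpu.avail, shared.avail);
    return 0;
}

With a batchcount of 8 this moves exactly 8 pointers and leaves 12 in the shared array, which is the "partial refill" behaviour the BATCHREFILL_LIMIT comment above is protecting.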
This fragment, which appears to come from setup_cpu_cache(), is what every cache starts out with while the allocator itself is still booting:
    cpu_cache_get(cachep)->avail = 0;
    cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
    cpu_cache_get(cachep)->batchcount = 1;
    cpu_cache_get(cachep)->touched = 0;
    cachep->batchcount = 1;
    cachep->limit = BOOT_CPUCACHE_ENTRIES;
    return 0;
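BOOT_CPUCACHE_ENTRIES is 1, so at this stage every cache limps along with a one-entry per-CPU array and a batchcount of 1; the real tunables are not applied until kmem_cache_init_late() retunes everything, as we will see below.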
And this is the allocation fast path, from ____cache_alloc():
    ac = cpu_cache_get(cachep);
    if (likely(ac->avail)) {
        ac->touched = 1;
        objp = ac_get_obj(cachep, ac, flags, false);

        /*
         * Allow for the possibility all avail objects are not allowed
         * by the current flags
         */
        if (objp) {
            STATS_INC_ALLOCHIT(cachep);
            goto out;
        }
        force_refill = true;
    }
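When ac->avail is non-zero, the allocation costs no list walking and no spinlock at all: ac_get_obj() essentially pops the top of a per-CPU LIFO stack, which also returns the most recently freed, and therefore cache-warm, object. A sketch of that idea, using the same toy model_ac as above (the kernel versions additionally handle memory-failure injection and debugging):

#include <stdio.h>

struct model_ac { unsigned int avail, limit; void *entry[64]; };

/* the essence of ac_get_obj(): pop the most recently freed pointer */
static void *model_ac_get(struct model_ac *ac)
{
    return ac->avail ? ac->entry[--ac->avail] : NULL;
}

/* the essence of ac_put_obj(): push on top of the stack */
static void model_ac_put(struct model_ac *ac, void *objp)
{
    ac->entry[ac->avail++] = objp;   /* caller must guarantee avail < limit */
}

int main(void)
{
    struct model_ac ac = { .avail = 0, .limit = 64 };
    int a, b;

    model_ac_put(&ac, &a);
    model_ac_put(&ac, &b);
    /* the last object freed is the first one handed out again */
    printf("%s\n", model_ac_get(&ac) == &b ? "LIFO" : "FIFO");
    return 0;
}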
It turned out I simply had not read all of the code. Look at the following function, which runs after kmem_cache_init() has finished its job:
void __init kmem_cache_init_late(void)
{
    struct kmem_cache *cachep;

    slab_state = UP;

    /* 6) resize the head arrays to their final sizes */
    mutex_lock(&slab_mutex);
    list_for_each_entry(cachep, &slab_caches, list)
        if (enable_cpucache(cachep, GFP_NOWAIT))
            BUG();
    mutex_unlock(&slab_mutex);

    /* Annotate slab for lockdep -- annotate the malloc caches */
    init_lock_keys();

    /* Done! */
    slab_state = FULL;

    /*
     * Register a cpu startup notifier callback that initializes
     * cpu_cache_get for all new cpus
     */
    register_cpu_notifier(&cpucache_notifier);

#ifdef CONFIG_NUMA
    /*
     * Register a memory hotplug callback that initializes and frees
     * nodelists.
     */
    hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
#endif

    /*
     * The reap timers are started later, with a module init call: That part
     * of the kernel is not yet operational.
     */
}
enable_cpucache() is what resizes each cache's head arrays to their final values:
/* Called with slab_mutex held always */
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
    int err;
    int limit = 0;
    int shared = 0;
    int batchcount = 0;

    if (!is_root_cache(cachep)) {
        struct kmem_cache *root = memcg_root_cache(cachep);
        limit = root->limit;
        shared = root->shared;
        batchcount = root->batchcount;
    }

    if (limit && shared && batchcount)
        goto skip_setup;
    /*
     * The head array serves three purposes:
     * - create a LIFO ordering, i.e. return objects that are cache-warm
     * - reduce the number of spinlock operations.
     * - reduce the number of linked list operations on the slab and
     *   bufctl chains: array operations are cheaper.
     * The numbers are guessed, we should auto-tune as described by
     * Bonwick.
     */
    if (cachep->size > 131072)       /* objects larger than 128K: limit is 1 */
        limit = 1;
    else if (cachep->size > PAGE_SIZE)
        limit = 8;
    else if (cachep->size > 1024)
        limit = 24;
    else if (cachep->size > 256)
        limit = 54;
    else
        limit = 120;

    /*
     * CPU bound tasks (e.g. network routing) can exhibit cpu bound
     * allocation behaviour: Most allocs on one cpu, most free operations
     * on another cpu. For these cases, an efficient object passing between
     * cpus is necessary. This is provided by a shared array. The array
     * replaces Bonwick's magazine layer.
     * On uniprocessor, it's functionally equivalent (but less efficient)
     * to a larger limit. Thus disabled by default.
     */
    shared = 0;
    if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)   /* on SMP shared is 8, on uniprocessor 0 */
        shared = 8;

#if DEBUG
    /*
     * With debugging enabled, large batchcount lead to excessively long
     * periods with disabled local interrupts. Limit the batchcount
     */
    if (limit > 32)
        limit = 32;
#endif
    batchcount = (limit + 1) / 2;
skip_setup:
    err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);  /* push the chosen values into the cache */
    if (err)
        printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
               cachep->name, -err);
    return err;
}
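To see how this table plays out in practice, here is a small standalone check; slab_limit() is just a hypothetical copy of the cascade above, assuming a 4096-byte PAGE_SIZE, fed with objsize values taken from the /proc/slabinfo dump further down:

#include <stdio.h>

/* hypothetical copy of the size -> limit cascade in enable_cpucache() */
static int slab_limit(unsigned long size, unsigned long page_size)
{
    if (size > 131072)
        return 1;
    if (size > page_size)
        return 8;
    if (size > 1024)
        return 24;
    if (size > 256)
        return 54;
    return 120;
}

int main(void)
{
    /* objsize values from the /proc/slabinfo dump below */
    unsigned long sizes[] = { 152, 296, 1328 };

    for (int i = 0; i < 3; i++) {
        int limit = slab_limit(sizes[i], 4096);
        printf("objsize %4lu -> limit %3d batchcount %2d\n",
               sizes[i], limit, (limit + 1) / 2);
    }
    return 0;
}

The output (120/60, 54/27, 24/12) matches the tunables columns of the nf_conntrack_expect, nf_conntrack and TCPv6 lines in the dump exactly.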
do_tune_cpucache() applies the tuning and, for a root cache, propagates it to the memcg child caches:
static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
                            int batchcount, int shared, gfp_t gfp)
{
    int ret;
    struct kmem_cache *c = NULL;
    int i = 0;

    ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);  /* tune the cache that was passed in */

    if (slab_state < FULL)
        return ret;

    if ((ret < 0) || !is_root_cache(cachep))
        return ret;

    VM_BUG_ON(!mutex_is_locked(&slab_mutex));
    for_each_memcg_cache_index(i) {
        c = cache_from_memcg(cachep, i);
        if (c)
            /* return value determined by the parent cache only */
            __do_tune_cpucache(c, limit, batchcount, shared, gfp);
    }

    return ret;
}
/* Always called with the slab_mutex held */
static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
                              int batchcount, int shared, gfp_t gfp)
{
    struct ccupdate_struct *new;

A quick note on the structure used here:

    struct ccupdate_struct {
        struct kmem_cache *cachep;
        struct array_cache *new[0];
    };

Continuing __do_tune_cpucache():

    int i;

    new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
                  gfp);   /* new is freed again before this function returns; it only serves as a staging area */
    if (!new)
        return -ENOMEM;

    for_each_online_cpu(i) {
        new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
                                       batchcount, gfp);
        if (!new->new[i]) {
            for (i--; i >= 0; i--)
                kfree(new->new[i]);
            kfree(new);
            return -ENOMEM;
        }
    }
    new->cachep = cachep;
    on_each_cpu(do_ccupdate_local, (void *)new, 1);  /* key point: run do_ccupdate_local() on every CPU to process new */

    check_irq_on();
    cachep->batchcount = batchcount;
    cachep->limit = limit;
    cachep->shared = shared;

    for_each_online_cpu(i) {
        struct array_cache *ccold = new->new[i];
        if (!ccold)
            continue;
        spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
        free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));  /* give the old array's objects back to the slab lists */
        spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
        kfree(ccold);
    }
    kfree(new);
    return alloc_kmemlist(cachep, gfp);
}
do_ccupdate_local() swaps in the new array on whichever CPU it runs on:
static void do_ccupdate_local(void *info)
{
    struct ccupdate_struct *new = info;
    struct array_cache *old;

    check_irq_off();
    old = cpu_cache_get(new->cachep);

    /* new->cachep already points at our cache, so this redirects the
     * cache's per-CPU array to the newly allocated storage */
    new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];

The new->new arrays were initialized when they were allocated, in alloc_arraycache(), called from the previous function:
static struct array_cache *alloc_arraycache(int node, int entries,
                                            int batchcount, gfp_t gfp)
{
    int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
    struct array_cache *nc = NULL;

    nc = kmalloc_node(memsize, gfp, node);
    /*
     * The array_cache structures contain pointers to free object.
     * However, when such objects are allocated or transferred to another
     * cache the pointers are not cleared and they could be counted as
     * valid references during a kmemleak scan. Therefore, kmemleak must
     * not scan such objects.
     */
    kmemleak_no_scan(nc);
    if (nc) {
        nc->avail = 0;
        nc->limit = entries;
        nc->batchcount = batchcount;
        nc->touched = 0;
        spin_lock_init(&nc->lock);
    }
    return nc;
}
Back in do_ccupdate_local(), the old array is handed back through the same slot:

    new->new[smp_processor_id()] = old;
}
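The trick worth noticing is that new->new[cpu] is used in both directions: on the way in it carries the freshly allocated array, and after the cross-CPU call it carries the old array back so that __do_tune_cpucache() can drain it with free_block() and kfree() it. A minimal user-space model of this idiom, with two fake "CPUs" and plain function calls instead of on_each_cpu():

#include <stdio.h>

#define NCPUS 2

struct fake_cache { void *array[NCPUS]; };   /* stands in for cachep->array[] */
struct fake_update {
    struct fake_cache *cachep;
    void *new[NCPUS];
};

/* models do_ccupdate_local() running on cpu 'cpu' */
static void fake_ccupdate(struct fake_update *u, int cpu)
{
    void *old = u->cachep->array[cpu];
    u->cachep->array[cpu] = u->new[cpu];     /* install the new array */
    u->new[cpu] = old;                       /* return the old one in the same slot */
}

int main(void)
{
    int old0, old1, new0, new1;
    struct fake_cache cache = { { &old0, &old1 } };
    struct fake_update u = { &cache, { &new0, &new1 } };

    for (int cpu = 0; cpu < NCPUS; cpu++)    /* on_each_cpu() in the kernel */
        fake_ccupdate(&u, cpu);

    /* u.new[] now holds the old arrays, ready to be drained and freed */
    printf("new installed: %d, old handed back: %d\n",
           cache.array[0] == &new0 && cache.array[1] == &new1,
           u.new[0] == &old0 && u.new[1] == &old1);
    return 0;
}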
With that in place, let's look at the slab information of an actual running kernel:
cat /proc/slabinfo
slabinfo - version: 2.1
# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab> : tunables <limit> <batchcount> <sharedfactor> : slabdata <active_slabs> <num_slabs> <sharedavail>
nf_conntrack_expect 0 0 152 26 1 : tunables 120 60 8 : slabdata 0 0 0
nf_conntrack_8050c5f0 2 26 296 13 1 : tunables 54 27 8 : slabdata 2 2 0
bridge_fdb_cache 4 78 48 78 1 : tunables 120 60 8 : slabdata 1 1 0
fib6_nodes 12 113 32 113 1 : tunables 120 60 8 : slabdata 1 1 0
ip6_dst_cache 25 57 208 19 1 : tunables 120 60 8 : slabdata 3 3 0
ip6_mrt_cache 0 0 112 35 1 : tunables 120 60 8 : slabdata 0 0 0
RAWv6 8 15 720 5 1 : tunables 54 27 8 : slabdata 3 3 0
UDPLITEv6 0 0 688 11 2 : tunables 54 27 8 : slabdata 0 0 0
UDPv6 3 22 688 11 2 : tunables 54 27 8 : slabdata 2 2 0
tw_sock_TCPv6 0 0 144 27 1 : tunables 120 60 8 : slabdata 0 0 0
request_sock_TCPv6 0 0 112 35 1 : tunables 120 60 8 : slabdata 0 0 0
TCPv6 5 6 1328 3 1 : tunables 24 12 8 : slabdata 2 2 0
ubi_wl_entry_slab 463 580 24 145 1 : tunables 120 60 8 : slabdata 4 4 0
sd_ext_cdb 2 113 32 113 1 : tunables 120 60 8 : slabdata 1 1 0
fuse_request 0 0 384 10 1 : tunables 54 27 8 : slabdata 0 0 0
fuse_inode 0 0 416 9 1 : tunables 54 27 8 : slabdata 0 0 0
jffs2_inode_cache 15 145 24 145 1 : tunables 120 60 8 : slabdata 1 1 0
jffs2_node_frag 130 290 24 145 1 : tunables 120 60 8 : slabdata 2 2 0
uid_cache 0 0 48 78 1 : tunables 120 60 8 : slabdata 0 0 0
UNIX 24 32 480 8 1 : tunables 54 27 8 : slabdata 4 4 0
ip_mrt_cache 0 0 96 40 1 : tunables 120 60 8 : slabdata 0 0 0
UDP-Lite 0 0 560 7 1 : tunables 54 27 8 : slabdata 0 0 0
tcp_bind_bucket 6 113 32 113 1 : tunables 120 60 8 : slabdata 1 1 0
inet_peer_cache 8 24 160 24 1 : tunables 120 60 8 : slabdata 1 1 0
ip_fib_trie 7 113 32 113 1 : tunables 120 60 8 : slabdata 1 1 0
ip_fib_alias 8 145 24 145 1 : tunables 120 60 8 : slabdata 1 1 0
ip_dst_cache 6 27 144 27 1 : tunables 120 60 8 : slabdata 1 1 0
PING 0 0 528 7 1 : tunables 54 27 8 : slabdata 0 0 0
RAW 4 7 544 7 1 : tunables 54 27 8 : slabdata 1 1 0
UDP 13 14 560 7 1 : tunables 54 27 8 : slabdata 2 2 0
tw_sock_TCP 0 0 112 35 1 : tunables 120 60 8 : slabdata 0 0 0
request_sock_TCP 0 0 80 48 1 : tunables 120 60 8 : slabdata 0 0 0
TCP 1 6 1184 6 2 : tunables 24 12 8 : slabdata 1 1 0
......
size-2048(DMA) 0 0 2048 2 1 : tunables 24 12 8 : slabdata 0 0 0
size-2048 192 192 2048 2 1 : tunables 24 12 8 : slabdata 96 96 0
size-1024(DMA) 0 0 1024 4 1 : tunables 54 27 8 : slabdata 0 0 0
size-1024 215 216 1024 4 1 : tunables 54 27 8 : slabdata 54 54 0
size-512(DMA) 0 0 512 8 1 : tunables 54 27 8 : slabdata 0 0 0
size-512 601 624 512 8 1 : tunables 54 27 8 : slabdata 78 78 0
size-256(DMA) 0 0 256 15 1 : tunables 120 60 8 : slabdata 0 0 0
size-256 1234 1245 256 15 1 : tunables 120 60 8 : slabdata 83 83 0
size-192(DMA) 0 0 256 15 1 : tunables 120 60 8 : slabdata 0 0 0
size-192 287 300 256 15 1 : tunables 120 60 8 : slabdata 20 20 0
size-128(DMA) 0 0 128 30 1 : tunables 120 60 8 : slabdata 0 0 0
size-128 1890 1890 128 30 1 : tunables 120 60 8 : slabdata 63 63 0
size-96(DMA) 0 0 128 30 1 : tunables 120 60 8 : slabdata 0 0 0
size-96 930 930 128 30 1 : tunables 120 60 8 : slabdata 31 31 0
size-64(DMA) 0 0 128 30 1 : tunables 120 60 8 : slabdata 0 0 0
size-32(DMA) 0 0 128 30 1 : tunables 120 60 8 : slabdata 0 0 0
size-64 1577 1650 128 30 1 : tunables 120 60 8 : slabdata 55 55 0
size-32 6213 6300 128 30 1 : tunables 120 60 8 : slabdata 210 210 0
kmem_cache 150 160 96 40 1 : tunables 120 60 8 : slabdata 4 4 0
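Notice that the tunables columns are exactly what enable_cpucache() computed: objects of 256 bytes or less get limit 120 and batchcount (120 + 1) / 2 = 60, sizes up to 1024 get 54/27, and sizes up to PAGE_SIZE get 24/12; the sharedfactor is 8 everywhere because this is an SMP box and all the object sizes are within a page. On a CONFIG_SLAB kernel these values can also be changed at runtime by writing "cache_name limit batchcount shared" to /proc/slabinfo, which ends up in the same do_tune_cpucache() path.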
For comparison, when the kernel is built with SLUB instead, this late-init hook is just an empty stub:
void __init kmem_cache_init_late(void)
{
}
slab is the foundation on which slub and slob build.

SLOB is aimed at embedded systems, mainly those with very limited memory, say under 32MB; it pays little attention to large SMP systems, although there have been some small improvements on that front recently.

The SLUB allocator is meant to replace the slab code. By removing large numbers of queues and their associated overhead and simplifying the slab structure, SLUB promises better performance and better system scalability, while keeping the existing slab allocator interface.
After all of this, let's wrap up with a simple diagram of the slab mechanism: