Memory management: slab allocator (1)

/***************************************************************************
 * The memory is organized in caches, one cache for each object type.
 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
 * Each cache consists of many slabs (they are small (usually one
 * page long) and always contiguous), and each slab contains multiple
 * initialized objects.
 *
 * This means that your constructor is used only for newly allocated
 * slabs and you must pass objects with the same initializations to
 * kmem_cache_free.
 *
 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
 * normal). If you need a special memory type, then you must create a new
 * cache for that memory type.
 *
 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
 *   full slabs with 0 free objects
 *   partial slabs
 *   empty slabs with no allocated objects
 *
 * If partial slabs exist, then new allocations come from these slabs,
 * otherwise allocations come from empty slabs, or new slabs are allocated.
 *
 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
 *
 * Each cache has a short per-cpu head array, most allocs
 * and frees go into that array, and if that array overflows, then 1/2
 * of the entries in the array are given back into the global cache.
 * The head array is strictly LIFO and should improve the cache hit rates.
 * On SMP, it additionally reduces the spinlock operations.
 *
 * The c_cpuarray may not be read with enabled local interrupts -
 * it's changed with a smp_call_function().
 *
 * SMP synchronization:
 *  constructors and destructors are called without any locking.
 *  Several members in struct kmem_cache and struct slab never change, they
 *    are accessed without any locking.
 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
 *      and local interrupts are disabled so slab code is preempt-safe.

 *  The non-constant members are protected with a per-cache irq spinlock.
 */
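
Before walking through the implementation, here is a minimal usage sketch of the API described above; struct foo, foo_ctor and the cache name are invented for illustration:

#include <linux/slab.h>
#include <linux/list.h>

struct foo {
    int id;
    struct list_head link;
};

static struct kmem_cache *foo_cachep;

/* Runs once per object, when a new slab is populated - not on every alloc. */
static void foo_ctor(void *obj)
{
    struct foo *f = obj;

    INIT_LIST_HEAD(&f->link);
}

static int foo_cache_init(void)
{
    foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
                                   0, SLAB_HWCACHE_ALIGN, foo_ctor);
    return foo_cachep ? 0 : -ENOMEM;
}

static void foo_use(void)
{
    struct foo *f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);

    if (!f)
        return;
    f->id = 1;
    /* Objects must be handed back in the same initialized state. */
    kmem_cache_free(foo_cachep, f);
}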

kmem_cache_create


/**
 * kmem_cache_create - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @ctor: A constructor for the objects.
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within an interrupt, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * @name must be valid until the cache is destroyed. This implies that
 * the module calling this has to destroy the cache before getting unloaded.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */
struct kmem_cache *
kmem_cache_create (const char *name, size_t size, size_t align,
    unsigned long flags, void (*ctor)(void *))
{
    size_t left_over, slab_size, ralign;
    struct kmem_cache *cachep = NULL, *pc;
    gfp_t gfp;

    list_for_each_entry(pc, &cache_chain, next) {
        if (!strcmp(pc->name, name)) {
            printk(KERN_ERR
                   "kmem_cache_create: duplicate cache %s\n", name);
            dump_stack();
            goto oops;
        }
    }

    /*
     * Check that size is in terms of words.  This is needed to avoid
     * unaligned accesses for some archs when redzoning is used, and makes
     * sure any on-slab bufctl's are also correctly aligned.
     */
    if (size & (BYTES_PER_WORD - 1)) {
        size += (BYTES_PER_WORD - 1);
        size &= ~(BYTES_PER_WORD - 1);
    }
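
    /*
     * Example: with BYTES_PER_WORD == 8 (a 64-bit kernel), a requested size
     * of 13 becomes (13 + 7) & ~7 == 16, so every object starts on a word
     * boundary (illustrative numbers only).
     */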

    ralign = BYTES_PER_WORD;
    align = ralign;
    gfp = GFP_KERNEL;

    /*
     * Get cache's description obj.
     * The struct kmem_cache itself is also managed by a cache, which poses a
     * chicken-and-egg problem; it is solved with the statically allocated
     * cache_cache.
     */

    cachep = kmem_cache_zalloc(&cache_cache, gfp);
    if (!cachep)
        goto oops;
    cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];

    /*
     * Determine if the slab management is 'on' or 'off' slab.
     * (bootstrapping cannot cope with offslab caches so don't do
     * it too early on. Always use on-slab management when
     * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
     */

    if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
        !(flags & SLAB_NOLEAKTRACE))
        /*
         * Size is large, assume best to place the slab management obj
         * off-slab (should allow better packing of objs).
         */
        flags |= CFLGS_OFF_SLAB;

    size = ALIGN(size, align);

    left_over = calculate_slab_order(cachep, size, align, flags);
    /* Size of the per-slab management data, which has two parts: the
     * struct slab itself and one kmem_bufctl_t per object. */
    slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
              + sizeof(struct slab), align);

    /* Slab colouring: the leftover space is consumed in colour_off-sized
     * steps to offset objects in successive slabs, so that objects at the
     * same index in different slabs do not map to the same cache lines. */
    cachep->colour_off = cache_line_size();
    /* Offset must be a multiple of the alignment. */
    if (cachep->colour_off < align)
        cachep->colour_off = align;
    cachep->colour = left_over / cachep->colour_off;
    cachep->slab_size = slab_size;
    cachep->flags = flags;
    cachep->gfpflags = 0;
    if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
        cachep->gfpflags |= GFP_DMA;
    cachep->buffer_size = size;
    cachep->reciprocal_buffer_size = reciprocal_value(size);
    cachep->name = name;

    setup_cpu_cache(cachep, gfp);

    /* cache setup completed, link it into the list */
    list_add(&cachep->next, &cache_chain);
oops:
    return cachep;
}
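
To make the colour fields concrete, a small worked example with made-up numbers:

/*
 * Suppose calculate_slab_order() left 200 bytes unused in each slab and
 * cache_line_size() is 64:
 *
 *     colour_off = 64;            // one cache line per colour step
 *     colour     = 200 / 64;      // = 3 distinct colours: 0, 1, 2
 *
 * cache_grow() cycles l3->colour_next through 0..2, so consecutive slabs
 * place their first object at byte offsets 0, 64 and 128. Objects at the
 * same index in different slabs therefore land on different hardware cache
 * lines instead of all competing for the same ones.
 */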


calculate_slab_order

/** Given the object size, how many pages does each slab occupy, and how many
 * objects does it contain? That is decided in this function.
 *
 * calculate_slab_order - calculate size (page order) of slabs
 * @cachep: pointer to the cache that is being created
 * @size: size of objects to be created in this cache.
 * @align: required alignment for the objects.
 * @flags: slab allocation flags
 *
 * Also calculates the number of objects per slab.
 *
 * This could be made much more intelligent.  For now, try to avoid using
 * high order pages for slabs.  When the gfp() functions are more friendly
 * towards high-order requests, this should be changed.
 */

static size_t calculate_slab_order(struct kmem_cache *cachep,
            size_t size, size_t align, unsigned long flags)
{
    /* ... inside the loop over candidate page orders ... */

    /* Found something acceptable - save it away */
    cachep->num = num;              /* number of objects per slab */
    cachep->gfporder = gfporder;    /* page order used by each slab */

    /*
     * Acceptable internal fragmentation? This is the stopping condition:
     * waste no more than 1/8 of the slab.
     */
    if (left_over * 8 <= (PAGE_SIZE << gfporder))
        break;
}
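
Where do num, gfporder and left_over come from? They are produced by cache_estimate(), which is not shown in this post. For the on-slab case it behaves roughly like the sketch below, a simplification that ignores off-slab management and some alignment corner cases:

static void estimate_on_slab(int gfporder, size_t buffer_size, size_t align,
                             unsigned int *num, size_t *left_over)
{
    size_t slab_bytes = PAGE_SIZE << gfporder;
    size_t mgmt;
    unsigned int n = 0;

    /*
     * Each object costs buffer_size bytes plus one kmem_bufctl_t slot in the
     * on-slab management area (struct slab + bufctl array, aligned).
     */
    while (ALIGN(sizeof(struct slab) + (n + 1) * sizeof(kmem_bufctl_t), align) +
           (n + 1) * buffer_size <= slab_bytes)
        n++;

    mgmt = ALIGN(sizeof(struct slab) + n * sizeof(kmem_bufctl_t), align);
    *num = n;                                         /* objects per slab */
    *left_over = slab_bytes - mgmt - n * buffer_size; /* bytes left unused */
}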


setup_cpu_cache

static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
    /* (the bootstrap paths taken while g_cpucache_up != FULL are omitted) */
    if (g_cpucache_up == FULL)
        return enable_cpucache(cachep, gfp);
}


/* Called with cache_chain_mutex held always */
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
    int err;
    int limit, shared;

    /*
     * The head array serves three purposes:
     * - create a LIFO ordering, i.e. return objects that are cache-warm
     * - reduce the number of spinlock operations.
     * - reduce the number of linked list operations on the slab and
     *   bufctl chains: array operations are cheaper.
     * The numbers are guessed, we should auto-tune as described by
     * Bonwick.
 */
    if (cachep->buffer_size > 131072)
        limit = 1;
    else if (cachep->buffer_size > PAGE_SIZE)
        limit = 8;
    else if (cachep->buffer_size > 1024)
        limit = 24;
    else if (cachep->buffer_size > 256)
        limit = 54;
    else
        limit = 120;

    /*
     * CPU bound tasks (e.g. network routing) can exhibit cpu bound
     * allocation behaviour: Most allocs on one cpu, most free operations
     * on another cpu. For these cases, an efficient object passing between
     * cpus is necessary. This is provided by a shared array. The array
     * replaces Bonwick's magazine layer.
     * On uniprocessor, it's functionally equivalent (but less efficient)
     * to a larger limit. Thus disabled by default.
 */
    shared = 0;
    if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1)
        shared = 8;


    err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
    if (err)
        printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
               cachep->name, -err);
    return err;
}
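
As an illustrative example of what these heuristics pick for a cache of 512-byte objects on a machine with more than one CPU:

/*
 * buffer_size = 512           -> limit = 54   (the "> 256" bucket)
 * batchcount  = (54 + 1) / 2  -> 27
 * shared      = 8             (512 <= PAGE_SIZE and num_possible_cpus() > 1)
 *
 * do_tune_cpucache(cachep, 54, 27, 8, gfp) then installs a per-cpu head
 * array of up to 54 objects on every CPU and, via alloc_kmemlist(), a
 * shared per-node array of 8 * 27 = 216 entries.
 */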

do_tune_cpucache

/* Always called with the cache_chain_mutex held */
static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
                int batchcount, int shared, gfp_t gfp)
{
    struct ccupdate_struct *new;
    int i;

    new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
              gfp);
    if (!new)
        return -ENOMEM;

    for_each_online_cpu(i) {
        new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
                        batchcount, gfp);
        if (!new->new[i]) {
            for (i--; i >= 0; i--)
                kfree(new->new[i]);
            kfree(new);
            return -ENOMEM;
        }
    }
    new->cachep = cachep;

    on_each_cpu(do_ccupdate_local, (void *)new, 1);

    check_irq_on();
    cachep->batchcount = batchcount;
    cachep->limit = limit;
    cachep->shared = shared;

    for_each_online_cpu(i) {
        struct array_cache *ccold = new->new[i];
        if (!ccold)
            continue;
        spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
        free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));
        spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
        kfree(ccold);
    }
    kfree(new);
    return alloc_kmemlist(cachep, gfp);
}
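
on_each_cpu(do_ccupdate_local, ...) is what actually installs the new per-cpu arrays. From memory, the helper looks roughly like this (treat it as a sketch rather than the exact source):

static void do_ccupdate_local(void *info)
{
    struct ccupdate_struct *new = info;
    struct array_cache *old;

    check_irq_off();
    old = cpu_cache_get(new->cachep);

    /*
     * Install the freshly allocated array for this CPU and hand the old one
     * back through new->new[], so do_tune_cpucache() can drain and free it.
     */
    new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
    new->new[smp_processor_id()] = old;
}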


alloc_kmemlist
/* This initializes kmem_list3 or resizes various caches for all nodes.
 * 'node' here refers to a NUMA memory node.
 */
static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
{
    int node;
    struct kmem_list3 *l3;
    struct array_cache *new_shared;
    struct array_cache **new_alien = NULL;

    for_each_online_node(node) {

        if (use_alien_caches) {
            new_alien = alloc_alien_cache(node, cachep->limit, gfp);
            if (!new_alien)
                goto fail;
        }

        new_shared = NULL;
        if (cachep->shared) {
            new_shared = alloc_arraycache(node,
                cachep->shared*cachep->batchcount,
                    0xbaadf00d, gfp);
            if (!new_shared) {
                free_alien_cache(new_alien);
                goto fail;
            }
        }

        l3 = cachep->nodelists[node];
        if (l3) {
            struct array_cache *shared = l3->shared;

            spin_lock_irq(&l3->list_lock);

            if (shared)
                free_block(cachep, shared->entry,
                        shared->avail, node);

            l3->shared = new_shared;
            if (!l3->alien) {
                l3->alien = new_alien;
                new_alien = NULL;
            }
            l3->free_limit = (1 + nr_cpus_node(node)) *
                    cachep->batchcount + cachep->num;
            spin_unlock_irq(&l3->list_lock);
            kfree(shared);
            free_alien_cache(new_alien);
            continue;
        }
        l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node);
        if (!l3) {
            free_alien_cache(new_alien);
            kfree(new_shared);
            goto fail;
        }

        kmem_list3_init(l3);
        l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
                ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
        l3->shared = new_shared;
        l3->alien = new_alien;
        l3->free_limit = (1 + nr_cpus_node(node)) *
                    cachep->batchcount + cachep->num;
        cachep->nodelists[node] = l3;
    }
    return 0;
fail:
    /* rollback of partially initialized nodes is omitted in this excerpt */
    return -ENOMEM;
}
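
The free_limit formula is worth a quick worked example (numbers made up for illustration):

/*
 * Say a node has 4 CPUs, batchcount is 27 and num is 8 objects per slab:
 *
 *     free_limit = (1 + 4) * 27 + 8 = 143
 *
 * free_block() compares l3->free_objects against this limit: roughly, only
 * once more than 143 objects sit unused on this node's lists does it start
 * destroying completely free slabs and returning their pages.
 */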

kmem_cache_create() does not allocate memory for the objects themselves; memory is allocated on demand when a request arrives, as in kmem_cache_alloc() below.

kmem_cache_alloc

/**
 * kmem_cache_alloc - Allocate an object
 * @cachep: The cache to allocate from.
 * @flags: See kmalloc().
 *
 * Allocate an object from this cache.  The flags are only relevant
 * if the cache has no available objects.
 */
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
    void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0));

    trace_kmem_cache_alloc(_RET_IP_, ret,
                   obj_size(cachep), cachep->buffer_size, flags);

    return ret;
}


static __always_inline void *
__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
{
    unsigned long save_flags;
    void *objp;

    flags &= gfp_allowed_mask;

    lockdep_trace_alloc(flags);

    if (slab_should_failslab(cachep, flags))
        return NULL;

    cache_alloc_debugcheck_before(cachep, flags);
    local_irq_save(save_flags);
    objp = __do_cache_alloc(cachep, flags);
    local_irq_restore(save_flags);
    objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
    kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
                 flags);
    prefetchw(objp);

    if (likely(objp))
        kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep));

    if (unlikely((flags & __GFP_ZERO) && objp))
        memset(objp, 0, obj_size(cachep));

    return objp;
}

static __always_inline void *
__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
    return ____cache_alloc(cachep, flags);
}

____cache_alloc

static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
    void *objp;
    struct array_cache *ac;

    check_irq_off();
    /* The fast-path decision is based on the array_cache's avail field. */
    ac = cpu_cache_get(cachep);
    if (likely(ac->avail)) {
        STATS_INC_ALLOCHIT(cachep);
        ac->touched = 1;
        objp = ac->entry[--ac->avail];
    } else {
        STATS_INC_ALLOCMISS(cachep);
        objp = cache_alloc_refill(cachep, flags);
        /*
         * the 'ac' may be updated by cache_alloc_refill(),
         * and kmemleak_erase() requires its correct value.
         */
        ac = cpu_cache_get(cachep);
    }
    /*
     * To avoid a false negative, if an object that is in one of the
     * per-CPU caches is leaked, we need to make sure kmemleak doesn't
     * treat the array pointers as a reference to the object.
     */
    if (objp)
        kmemleak_erase(&ac->entry[ac->avail]);
    return objp;
}
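
For contrast with this allocation fast path, freeing pushes the object back onto the same per-cpu array, which is what keeps it strictly LIFO. A simplified sketch of the free side (__cache_free() with its debug, kmemleak and kmemcheck hooks stripped; the helper name below is mine):

static inline void cache_free_fastpath(struct kmem_cache *cachep, void *objp)
{
    struct array_cache *ac = cpu_cache_get(cachep);

    check_irq_off();
    if (likely(ac->avail < ac->limit)) {
        STATS_INC_FREEHIT(cachep);
        ac->entry[ac->avail++] = objp;   /* push: most recently freed on top */
    } else {
        STATS_INC_FREEMISS(cachep);
        cache_flusharray(cachep, ac);    /* drain batchcount objects to the node lists */
        ac->entry[ac->avail++] = objp;
    }
}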

cache_alloc_refill

static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
    int batchcount;
    struct kmem_list3 *l3;
    struct array_cache *ac;
    int node;

retry:
    check_irq_off();
    node = numa_mem_id();
    ac = cpu_cache_get(cachep);
    batchcount = ac->batchcount;
    if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
        /*
         * If there was little recent activity on this cache, then
         * perform only a partial refill.  Otherwise we could generate
         * refill bouncing.
         */
        batchcount = BATCHREFILL_LIMIT;
    }
    l3 = cachep->nodelists[node];

    BUG_ON(ac->avail > 0 || !l3);
    spin_lock(&l3->list_lock);

    /*
     * See if we can refill from the shared array: transfer entries from
     * the shared array_cache to the current CPU's array_cache.
     */
    if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
        l3->shared->touched = 1;
        goto alloc_done;
    }
    /*
     * Two nested loops: the outer one is driven by batchcount and picks a
     * slab from the partial (or free) list; the inner one runs while
     * slabp->inuse < cachep->num and fills ac->entry[ac->avail++] with
     * slab_get_obj().
     */

    while (batchcount > 0) {
        struct list_head *entry;
        struct slab *slabp;
        /* Get slab alloc is to come from. */
        entry = l3->slabs_partial.next;
        if (entry == &l3->slabs_partial) {
            l3->free_touched = 1;
            entry = l3->slabs_free.next;
            if (entry == &l3->slabs_free)
                goto must_grow;
        }

        slabp = list_entry(entry, struct slab, list);
        check_slabp(cachep, slabp);
        check_spinlock_acquired(cachep);

        /*
         * The slab was either on partial or free list so
         * there must be at least one object available for
         * allocation.
         */
        BUG_ON(slabp->inuse >= cachep->num);

        while (slabp->inuse < cachep->num && batchcount--) {
            STATS_INC_ALLOCED(cachep);
            STATS_INC_ACTIVE(cachep);
            STATS_SET_HIGH(cachep);

            ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
                                node);
        }
        check_slabp(cachep, slabp);

        /* move slabp to correct slabp list: */
        list_del(&slabp->list);
        if (slabp->free == BUFCTL_END)
            list_add(&slabp->list, &l3->slabs_full);
        else
            list_add(&slabp->list, &l3->slabs_partial);
    }

must_grow:
    l3->free_objects -= ac->avail;
alloc_done:
    spin_unlock(&l3->list_lock);

    if (unlikely(!ac->avail)) {
        int x;
        x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);

        /* cache_grow can reenable interrupts, then ac could change. */
        ac = cpu_cache_get(cachep);
        if (!x && ac->avail == 0)    /* no objects in sight? abort */
            return NULL;

        if (!ac->avail)        /* objects refilled by interrupt? */
            goto retry;
    }
    ac->touched = 1;
    return ac->entry[--ac->avail];
}

static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
                int nodeid)
{
    void *objp = index_to_obj(cachep, slabp, slabp->free);
    kmem_bufctl_t next;

    slabp->inuse++;
    next = slab_bufctl(slabp)[slabp->free];
#if DEBUG
    slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
    WARN_ON(slabp->nodeid != nodeid);
#endif
    slabp->free = next;

    return objp;
}

static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
                 unsigned int idx)
{
    return slab->s_mem + cache->buffer_size * idx;
}
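
slab_bufctl() is the other half of this scheme: the kmem_bufctl_t array sits immediately after struct slab in the management area, and together with slabp->free it forms an index-linked free list. As far as I recall, its definition is simply:

static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
{
    return (kmem_bufctl_t *)(slabp + 1);
}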

cache_grow

/*
 * Grow (by 1) the number of slabs within a cache.  This is called by
 * kmem_cache_alloc() when there are no active objs left in a cache.
 */
static int cache_grow(struct kmem_cache *cachep,
        gfp_t flags, int nodeid, void *objp)
{
    struct slab *slabp;
    size_t offset;
    gfp_t local_flags;
    struct kmem_list3 *l3;

    /*
     * Be lazy and only check for valid flags here,  keeping it out of the
     * critical path in kmem_cache_alloc().
     */
    BUG_ON(flags & GFP_SLAB_BUG_MASK);
    local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);

    /* Take the l3 list lock to change the colour_next on this node */
    check_irq_off();
    l3 = cachep->nodelists[nodeid];
    spin_lock(&l3->list_lock);

    /* Get colour for the slab, and calculate the next value. */
    offset = l3->colour_next;
    l3->colour_next++;
    if (l3->colour_next >= cachep->colour)
        l3->colour_next = 0;
    spin_unlock(&l3->list_lock);

    offset *= cachep->colour_off;

    if (local_flags & __GFP_WAIT)
        local_irq_enable();

    /*
     * The test for missing atomic flag is performed here, rather than
     * the more obvious place, simply to reduce the critical path length
     * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
     * will eventually be caught here (where it matters).
     */
    kmem_flagcheck(cachep, flags);

    /*
     * Get mem for the objs.  Attempt to allocate a physical page from
     * 'nodeid'.
     */

    if (!objp)
        objp = kmem_getpages(cachep, local_flags, nodeid);
    if (!objp)
        goto failed;

     /* Get slab management. */
    slabp = alloc_slabmgmt(cachep, objp, offset,
            local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
    if (!slabp)
        goto opps1;

    slab_map_pages(cachep, slabp, objp);

    cache_init_objs(cachep, slabp);

    if (local_flags & __GFP_WAIT)
        local_irq_disable();
    check_irq_off();
    spin_lock(&l3->list_lock);

    /* Make slab active. */
    list_add_tail(&slabp->list, &(l3->slabs_free));
    STATS_INC_GROWN(cachep);
    l3->free_objects += cachep->num;
    spin_unlock(&l3->list_lock);
    return 1;
opps1:
    kmem_freepages(cachep, objp);
failed:
    if (local_flags & __GFP_WAIT)
        local_irq_disable();
    return 0;
}
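
alloc_slabmgmt() is where the colour offset computed above takes effect. A sketch of what it does (error handling and off-slab details trimmed):

static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
                                   int colour_off, gfp_t local_flags, int nodeid)
{
    struct slab *slabp;

    if (OFF_SLAB(cachep)) {
        /* Management data lives in a separate cache, not in the slab itself. */
        slabp = kmem_cache_alloc_node(cachep->slabp_cache, local_flags, nodeid);
        if (!slabp)
            return NULL;
    } else {
        /* On-slab: struct slab sits at the coloured offset inside the pages. */
        slabp = objp + colour_off;
        colour_off += cachep->slab_size;
    }
    slabp->inuse = 0;
    slabp->colouroff = colour_off;   /* objects start after the mgmt data */
    slabp->s_mem = objp + colour_off;
    slabp->nodeid = nodeid;
    slabp->free = 0;                 /* index of the first free object */
    return slabp;
}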



static void cache_init_objs(struct kmem_cache *cachep,
                struct slab *slabp)
{
    int i;

    for (i = 0; i < cachep->num; i++) {
        void *objp = index_to_obj(cachep, slabp, i);
        slab_bufctl(slabp)[i] = i + 1;
    }
    slab_bufctl(slabp)[i - 1] = BUFCTL_END;
}
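
For a slab with, say, cachep->num == 4, the loop above leaves the free list looking like this (illustration only):

/*
 * slabp->free = 0                               // head of the free list
 * slab_bufctl(slabp) = { 1, 2, 3, BUFCTL_END }
 *
 * slab_get_obj() pops index 0 (free becomes 1), then 1, then 2, then 3;
 * once free == BUFCTL_END the slab is full and moves to slabs_full.
 */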


The main purpose of all this is to make memory-corruption bugs debuggable; the next post will look at the extra debugging information the slab allocator adds to memory.
