一、内存池
内存池,那真是分析得太多了,这玩意儿几乎是所有框架的必备组件。哪有计算机框架不用内存的,又哪有分配内存时直愣愣地直接new、malloc的。先不说这样看上去是不是让人觉得太LOW,关键是内存确实很难管理:应用场景不同,内存池的管理方式也要随之调整。你看,有单一大小的,这算是入门版;有多个大小动态适应的,这算是高级版;还有动态生成和静态分组管理的,这算是专家版。
总之,就是要让内存分配和管理回收之间达到一个效率与资源的平衡,这在不同场景下可能就有所不同,侧重点有所不一样,所以一切以实际应用为原则。
二、DPDK中的内存池
作为固定大小内存分配的一种管理方式,内存池mempool其实是为了应对内存的快速分配和回收。但是一般情况下,所需内存的大小无法精确确定,所以使用内存池往往会浪费一小部分内存,这也是一种妥协。在DPDK中,内存池由三个部分来实现:
1、内存池的节点对象。这些对象存储在全局队列中,可通过唯一标识来访问,当然它只是一个指针结构并不是真正的内存区。
2、内存的实际存储区。它在rte_memzone 中分配出来的连续内存中,用来存储相关的内存池对象。
3、ring无锁队列。无锁队列意味着多线程中是安全的,它可以用来管理mempool的对象。
整个内存池通过环形无锁队列映射进行内存池对象的存取,同时为了兼顾多核冲突而引入了local_cache对象缓冲区,尽量减少多核访问环形队列的并发处理。
三、数据结构和源码
在前面已经把内存池rte_mempool这个数据结构简单分析了一下,这里只给出相关的代码片段:
/*
 * Mempool descriptor (excerpt). Most members are elided by the "......."
 * line below, so only the members visible here are described.
 */
struct rte_mempool {
/* Pool name; the buffer holds RTE_MEMZONE_NAMESIZE bytes. */
char name[RTE_MEMZONE_NAMESIZE];
RTE_STD_C11
/* Anonymous union: the same storage viewed as a pointer or as a 64-bit id. */
union {
void * pool_data;
uint64_t pool_id;
};
/* Opaque pointer; presumably driver-specific configuration -- TODO confirm. */
void * pool_config;
.......
#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
/* Debug builds only: array of RTE_MAX_LCORE statistics entries. */
struct rte_mempool_debug_stats stats[RTE_MAX_LCORE];
#endif
} __rte_cache_aligned;
1、创建
/*
 * Create a mempool and populate it with objects.
 *
 * Steps, in order:
 *   1. rte_mempool_create_empty() builds the empty pool descriptor.
 *   2. The ops table is selected by name from the SP_PUT / SC_GET flags.
 *   3. mp_init (if non-NULL) is called with (mp, mp_init_arg).
 *   4. rte_mempool_populate_default() adds the objects.
 *   5. obj_init (if non-NULL) is run over the objects through
 *      rte_mempool_obj_iter() with obj_init_arg.
 *
 * Returns the new mempool, or NULL on failure. When a step after the
 * empty-pool creation fails, the pool is released with rte_mempool_free().
 */
struct rte_mempool *
rte_mempool_create(const char * name, unsigned n, unsigned elt_size,
	unsigned cache_size, unsigned private_data_size,
	rte_mempool_ctor_t * mp_init, void * mp_init_arg,
	rte_mempool_obj_cb_t * obj_init, void * obj_init_arg,
	int socket_id, unsigned flags)
{
	int ret;
	struct rte_mempool * mp;

	mp = rte_mempool_create_empty(name, n, elt_size, cache_size,
		private_data_size, socket_id, flags);
	if (mp == NULL)
		return NULL;

	/*
	 * Since we have 4 combinations of the SP/SC/MP/MC examine the flags to
	 * set the correct index into the table of ops structs.
	 */
	if ((flags & MEMPOOL_F_SP_PUT) && (flags & MEMPOOL_F_SC_GET))
		ret = rte_mempool_set_ops_byname(mp, "ring_sp_sc", NULL);
	else if (flags & MEMPOOL_F_SP_PUT)
		ret = rte_mempool_set_ops_byname(mp, "ring_sp_mc", NULL);
	else if (flags & MEMPOOL_F_SC_GET)
		ret = rte_mempool_set_ops_byname(mp, "ring_mp_sc", NULL);
	else
		ret = rte_mempool_set_ops_byname(mp, "ring_mp_mc", NULL);

	if (ret)
		goto fail;

	/* call the mempool priv initializer */
	if (mp_init)
		mp_init(mp, mp_init_arg);

	if (rte_mempool_populate_default(mp) < 0)
		goto fail;

	/* call the object initializers */
	if (obj_init)
		rte_mempool_obj_iter(mp, obj_init, obj_init_arg);

	return mp;

fail:
	rte_mempool_free(mp);
	return NULL;
}
它调用了rte_mempool_create_empty()这个函数:
/*
 * Free a cache. It's the responsibility of the user to make sure that any
 * remaining objects in the cache are flushed to the corresponding
 * mempool.
 *
 * The cache pointer is handed straight to rte_free(); nothing is flushed
 * or inspected here.
 */
void
rte_mempool_cache_free(struct rte_mempool_cache *cache)
{
rte_free(cache);
}
/*
 * Create an empty mempool.
 *
 * Validates the arguments, reserves a memzone for the pool header plus the
 * private data area, fills in the descriptor fields, initialises the default
 * caches when cache_size is non-zero, and appends the pool to the mempool
 * tailq list. No objects are added by this function.
 *
 * Returns the new mempool, or NULL on failure. rte_errno is set to EINVAL
 * for n == 0, an oversized cache, or a failed object-size calculation, and
 * to ENAMETOOLONG when the name does not fit.
 */
struct rte_mempool *
rte_mempool_create_empty(const char *name, unsigned n, unsigned elt_size,
unsigned cache_size, unsigned private_data_size,
int socket_id, unsigned flags)
{
char mz_name[RTE_MEMZONE_NAMESIZE];
struct rte_mempool_list *mempool_list;
struct rte_mempool *mp = NULL;
struct rte_tailq_entry *te = NULL;
const struct rte_memzone *mz = NULL;
size_t mempool_size;
unsigned int mz_flags = RTE_MEMZONE_1GB|RTE_MEMZONE_SIZE_HINT_ONLY;
struct rte_mempool_objsz objsz;
unsigned lcore_id;
int ret;
/* compilation-time checks */
RTE_BUILD_BUG_ON((sizeof(struct rte_mempool) &
RTE_CACHE_LINE_MASK) != 0);
RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_cache) &
RTE_CACHE_LINE_MASK) != 0);
#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_debug_stats) &
RTE_CACHE_LINE_MASK) != 0);
RTE_BUILD_BUG_ON((offsetof(struct rte_mempool, stats) &
RTE_CACHE_LINE_MASK) != 0);
#endif
mempool_list = RTE_TAILQ_CAST(rte_mempool_tailq.head, rte_mempool_list);
/* asked for zero items */
if (n == 0) {
rte_errno = EINVAL;
return NULL;
}
/* asked cache too big */
if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE ||
CALC_CACHE_FLUSHTHRESH(cache_size) > n) {
rte_errno = EINVAL;
return NULL;
}
/* "no cache align" imply "no spread" */
if (flags & MEMPOOL_F_NO_CACHE_ALIGN)
flags |= MEMPOOL_F_NO_SPREAD;
/* calculate mempool object sizes. */
if (!rte_mempool_calc_obj_size(elt_size, flags, &objsz)) {
rte_errno = EINVAL;
return NULL;
}
/* From here on every failure must leave through exit_unlock. */
rte_mcfg_mempool_write_lock();
/*
 * reserve a memory zone for this mempool: private data is
 * cache-aligned
 */
/* Round private_data_size up using RTE_MEMPOOL_ALIGN_MASK. */
private_data_size = (private_data_size +
RTE_MEMPOOL_ALIGN_MASK) & (~RTE_MEMPOOL_ALIGN_MASK);
/* try to allocate tailq entry */
te = rte_zmalloc("MEMPOOL_TAILQ_ENTRY", sizeof(*te), 0);
if (te == NULL) {
/* NOTE(review): rte_errno is not set here; confirm rte_zmalloc sets it. */
RTE_LOG(ERR, MEMPOOL, "Cannot allocate tailq entry!\n");
goto exit_unlock;
}
/* Total reservation: header (sized for cache_size) plus private data. */
mempool_size = MEMPOOL_HEADER_SIZE(mp, cache_size);
mempool_size += private_data_size;
mempool_size = RTE_ALIGN_CEIL(mempool_size, RTE_MEMPOOL_ALIGN);
ret = snprintf(mz_name, sizeof(mz_name), RTE_MEMPOOL_MZ_FORMAT, name);
if (ret < 0 || ret >= (int)sizeof(mz_name)) {
rte_errno = ENAMETOOLONG;
goto exit_unlock;
}
mz = rte_memzone_reserve(mz_name, mempool_size, socket_id, mz_flags);
if (mz == NULL)
goto exit_unlock;
/* init the mempool structure */
mp = mz->addr;
memset(mp, 0, MEMPOOL_HEADER_SIZE(mp, cache_size));
ret = strlcpy(mp->name, name, sizeof(mp->name));
if (ret < 0 || ret >= (int)sizeof(mp->name)) {
/*
 * NOTE(review): mp->mz has not been assigned yet on this path, so
 * confirm that rte_mempool_free(mp) below still releases mz.
 */
rte_errno = ENAMETOOLONG;
goto exit_unlock;
}
mp->mz = mz;
mp->size = n;
mp->flags = flags;
mp->socket_id = socket_id;
mp->elt_size = objsz.elt_size;
mp->header_size = objsz.header_size;
mp->trailer_size = objsz.trailer_size;
/* Size of default caches, zero means disabled. */
mp->cache_size = cache_size;
mp->private_data_size = private_data_size;
STAILQ_INIT(&mp->elt_list);
STAILQ_INIT(&mp->mem_list);
/*
 * local_cache pointer is set even if cache_size is zero.
 * The local_cache points to just past the elt_pa[] array.
 */
mp->local_cache = (struct rte_mempool_cache *)
RTE_PTR_ADD(mp, MEMPOOL_HEADER_SIZE(mp, 0));
/* Init all default caches. */
if (cache_size != 0) {
for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
mempool_cache_init(&mp->local_cache[lcore_id],
cache_size);
}
/* Publish the pool: the tailq lock is taken inside the mempool lock. */
te->data = mp;
rte_mcfg_tailq_write_lock();
TAILQ_INSERT_TAIL(mempool_list, te, next);
rte_mcfg_tailq_write_unlock();
rte_mcfg_mempool_write_unlock();
return mp;
exit_unlock:
/*
 * Shared failure path: te and mp may still be NULL here, so rte_free()
 * and rte_mempool_free() are presumably NULL-tolerant -- TODO confirm.
 */
rte_mcfg_mempool_write_unlock();
rte_free(te);
rte_mempool_free(mp);
return NULL;
}
在前面说过,在内存池的数据结构定义中有三个重要的部分 rte_mempool , rte_mempool_cache 和mempool private,它们都在上面的函数中创建。生成的对象都挂在rte_tailq_elem类型的静态变量rte_mempool_tailq中。
然后通过对三个数据结构体进行计算得到mempool的头大小并得到所有核的Cache的大小。
真正的内存,也就是实际存放对象的内存区,是在rte_mempool_populate_default这个函数中创建的:
/* Default function to populate the mempool: allocate memory in memzones,
 * and populate them. Return the number of objects added, or a negative
 * value on error.
 *
 * On success the value returned is mp->size. The pool must not have any
 * memory chunks yet (-EEXIST otherwise). On failure inside the loop all
 * chunks added so far are released with rte_mempool_free_memchunks().
 */
int
rte_mempool_populate_default(struct rte_mempool *mp)
{
unsigned int mz_flags = RTE_MEMZONE_1GB|RTE_MEMZONE_SIZE_HINT_ONLY;
char mz_name[RTE_MEMZONE_NAMESIZE];
const struct rte_memzone *mz;
ssize_t mem_size;
size_t align, pg_sz, pg_shift = 0;
rte_iova_t iova;
unsigned mz_id, n;
int ret;
bool need_iova_contig_obj;
size_t max_alloc_size = SIZE_MAX;
/* Make sure the ops-level pool data exists before adding memory. */
ret = mempool_ops_alloc_once(mp);
if (ret != 0)
return ret;
/* mempool must not be populated */
if (mp->nb_mem_chunks != 0)
return -EEXIST;
/*
 * the following section calculates page shift and page size values.
 *
 * these values impact the result of calc_mem_size operation, which
 * returns the amount of memory that should be allocated to store the
 * desired number of objects. when not zero, it allocates more memory
 * for the padding between objects, to ensure that an object does not
 * cross a page boundary. in other words, page size/shift are to be set
 * to zero if mempool elements won't care about page boundaries.
 * there are several considerations for page size and page shift here.
 *
 * if we don't need our mempools to have physically contiguous objects,
 * then just set page shift and page size to 0, because the user has
 * indicated that there's no need to care about anything.
 *
 * if we do need contiguous objects (if a mempool driver has its
 * own calc_size() method returning min_chunk_size = mem_size),
 * there is also an option to reserve the entire mempool memory
 * as one contiguous block of memory.
 *
 * if we require contiguous objects, but not necessarily the entire
 * mempool reserved space to be contiguous, pg_sz will be != 0,
 * and the default ops->populate() will take care of not placing
 * objects across pages.
 *
 * if our IO addresses are physical, we may get memory from bigger
 * pages, or we might get memory from smaller pages, and how much of it
 * we require depends on whether we want bigger or smaller pages.
 * However, requesting each and every memory size is too much work, so
 * what we'll do instead is walk through the page sizes available, pick
 * the smallest one and set up page shift to match that one. We will be
 * wasting some space this way, but it's much nicer than looping around
 * trying to reserve each and every page size.
 *
 * If we fail to get enough contiguous memory, then we'll go and
 * reserve space in smaller chunks.
 */
need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
ret = rte_mempool_get_page_size(mp, &pg_sz);
if (ret < 0)
return ret;
if (pg_sz != 0)
pg_shift = rte_bsf32(pg_sz);
/*
 * One memzone per iteration. n counts the objects still to be added and is
 * reduced by ret, the value returned by the populate call at the bottom of
 * the loop body.
 */
for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
size_t min_chunk_size;
mem_size = rte_mempool_ops_calc_mem_size(
mp, n, pg_shift, &min_chunk_size, &align);
if (mem_size < 0) {
ret = mem_size;
goto fail;
}
/* NOTE(review): mz_id is unsigned but is formatted with %d. */
ret = snprintf(mz_name, sizeof(mz_name),
RTE_MEMPOOL_MZ_FORMAT "_%d", mp->name, mz_id);
if (ret < 0 || ret >= (int)sizeof(mz_name)) {
ret = -ENAMETOOLONG;
goto fail;
}
/* if we're trying to reserve contiguous memory, add appropriate
 * memzone flag.
 */
/* NOTE(review): once set, the flag stays in mz_flags for later iterations. */
if (min_chunk_size == (size_t)mem_size)
mz_flags |= RTE_MEMZONE_IOVA_CONTIG;
/* Allocate a memzone, retrying with a smaller area on ENOMEM */
do {
mz = rte_memzone_reserve_aligned(mz_name,
RTE_MIN((size_t)mem_size, max_alloc_size),
mp->socket_id, mz_flags, align);
if (mz != NULL || rte_errno != ENOMEM)
break;
/* Halve the request; give up once it drops below min_chunk_size. */
max_alloc_size = RTE_MIN(max_alloc_size,
(size_t)mem_size) / 2;
} while (mz == NULL && max_alloc_size >= min_chunk_size);
if (mz == NULL) {
ret = -rte_errno;
goto fail;
}
if (need_iova_contig_obj)
iova = mz->iova;
else
iova = RTE_BAD_IOVA;
/* Whole-zone populate when page size is ignored or the zone is IOVA-contiguous. */
if (pg_sz == 0 || (mz_flags & RTE_MEMZONE_IOVA_CONTIG))
ret = rte_mempool_populate_iova(mp, mz->addr,
iova, mz->len,
rte_mempool_memchunk_mz_free,
(void *)(uintptr_t)mz);
else
ret = rte_mempool_populate_virt(mp, mz->addr,
mz->len, pg_sz,
rte_mempool_memchunk_mz_free,
(void *)(uintptr_t)mz);
if (ret < 0) {
/* This zone was not attached to the pool, so free it directly. */
rte_memzone_free(mz);
goto fail;
}
}
return mp->size;
fail:
rte_mempool_free_memchunks(mp);
return ret;
}
其中通过下面的函数来创建Ring:
/*
 * Run the ops-level allocation for this pool at most once.
 *
 * MEMPOOL_F_POOL_CREATED in mp->flags records that the allocation has
 * already happened; it is set only after rte_mempool_ops_alloc() succeeds.
 * Returns 0 on success (or when already done), otherwise the error code
 * from rte_mempool_ops_alloc().
 */
static int
mempool_ops_alloc_once(struct rte_mempool *mp)
{
	int rc;

	/* Already created: nothing to do. */
	if ((mp->flags & MEMPOOL_F_POOL_CREATED) != 0)
		return 0;

	rc = rte_mempool_ops_alloc(mp);
	if (rc != 0)
		return rc;

	mp->flags |= MEMPOOL_F_POOL_CREATED;
	return 0;
}
/*
 * Dispatch to the alloc callback of the ops table selected by
 * mp->ops_index and return whatever that callback returns.
 */
int
rte_mempool_ops_alloc(struct rte_mempool *mp)
{
	return rte_mempool_get_ops(mp->ops_index)->alloc(mp);
}
分配就得看rte_mempool_ops中的这个alloc在哪里初始化了:
/*
 * The following 4 declarations of mempool ops structs address
 * the need for the backward compatible mempool handlers for
 * single/multi producers and single/multi consumers as dictated by the
 * flags provided to the rte_mempool_create function
 *
 * Only ops_mp_mc is spelled out in this excerpt; ops_sp_sc, ops_mp_sc and
 * ops_sp_mc are registered below but their definitions are not shown here.
 */
static const struct rte_mempool_ops ops_mp_mc = {
/* Name looked up by rte_mempool_set_ops_byname(). */
.name = "ring_mp_mc",
.alloc = common_ring_alloc,
.free = common_ring_free,
/* Multi-producer enqueue paired with multi-consumer dequeue. */
.enqueue = common_ring_mp_enqueue,
.dequeue = common_ring_mc_dequeue,
.get_count = common_ring_get_count,
};
/* Register each ops struct under its .name. */
MEMPOOL_REGISTER_OPS(ops_mp_mc);
MEMPOOL_REGISTER_OPS(ops_sp_sc);
MEMPOOL_REGISTER_OPS(ops_mp_sc);
MEMPOOL_REGISTER_OPS(ops_sp_mc);
这样,通过注册宏的调用基本就可以明白整个流程了。这里面嵌套得很深,有兴趣可以逐一跟进。整体上就是:rte_ring_create_elem函数创建ring后将其插入到全局的rte_ring_tailq,然后再通过rte_mempool_ops_populate函数调用mempool_add_elem函数,将申请到的实际内存插入到链表中。
2、使用
使用是分配Alloc:
// mbuf-side caller
/*
 * Take one raw mbuf from the given mempool.
 * Returns NULL when rte_mempool_get() reports an error (negative value);
 * otherwise runs MBUF_RAW_ALLOC_CHECK on the mbuf and returns it.
 */
static inline struct rte_mbuf *rte_mbuf_raw_alloc(struct rte_mempool *mp)
{
	struct rte_mbuf *mbuf;
	int rc;

	rc = rte_mempool_get(mp, (void **)&mbuf);
	if (rc < 0)
		return NULL;

	MBUF_RAW_ALLOC_CHECK(mbuf);
	return mbuf;
}
// mempool-side interface
/*
 * Get n objects from the mempool into obj_table, going through the
 * default cache of the calling lcore. Returns the result of
 * rte_mempool_generic_get().
 */
static __rte_always_inline int
rte_mempool_get_bulk(struct rte_mempool *mp, void **obj_table, unsigned int n)
{
	return rte_mempool_generic_get(mp, obj_table, n,
			rte_mempool_default_cache(mp, rte_lcore_id()));
}
/*
 * Get a single object from the mempool: a bulk get with a count of 1.
 * The object pointer is stored through obj_p; the return value is that of
 * rte_mempool_get_bulk().
 */
static __rte_always_inline int
rte_mempool_get(struct rte_mempool *mp, void **obj_p)
{
return rte_mempool_get_bulk(mp, obj_p, 1);
}
它最终会调用rte_mempool_get_bulk函数来获得内存对象。先从本地的Cache中获取,不够才会从rte_ring中获取mbuf并存储在本地的local_cache中。
3、回收
回收是Free:
/*
 * Return a raw mbuf to the mempool recorded in m->pool.
 *
 * The assertions state the expected condition of the mbuf on entry: it is
 * a direct mbuf, its reference count reads 1, it has no next segment and
 * nb_segs is 1.
 */
static __rte_always_inline void
rte_mbuf_raw_free(struct rte_mbuf *m)
{
RTE_ASSERT(RTE_MBUF_DIRECT(m));
RTE_ASSERT(rte_mbuf_refcnt_read(m) == 1);
RTE_ASSERT(m->next == NULL);
RTE_ASSERT(m->nb_segs == 1);
__rte_mbuf_sanity_check(m, 0);
/* Hand the mbuf back to its owning pool. */
rte_mempool_put(m->pool, m);
}
/*
 * Put a single object back into the mempool: a bulk put with a count of 1,
 * using the address of the local obj pointer as a one-entry table.
 */
static __rte_always_inline void
rte_mempool_put(struct rte_mempool *mp, void *obj)
{
rte_mempool_put_bulk(mp, &obj, 1);
}
/*
 * Put n objects from obj_table back into the mempool, going through the
 * default cache of the calling lcore.
 */
static __rte_always_inline void
rte_mempool_put_bulk(struct rte_mempool *mp, void * const *obj_table,
		     unsigned int n)
{
	rte_mempool_generic_put(mp, obj_table, n,
			rte_mempool_default_cache(mp, rte_lcore_id()));
}
一路下去就找到结果了。
四、总结
在可预见的将来,计算机的内存管理仍然是一件很重要的事,所以学习一些内存管理的算法和相关技巧是十分必要的。不可忽视的是,内存池的使用是用空间换时间,用浪费一部分内存换取了内存碎片的降低。在编程人的眼中,永远是"平衡"两个字,这个平衡不是简单的平均,而是从整体考虑的平衡。在数据层不平衡,在应用层反向不平衡,整体就平衡了。好好理解和思考就会明白,内存作为基础的开发应用资源,是多么的重要。