jemalloc 深入分析之 Tcache 实现原理

最新推荐文章于 2025-02-24 03:08:25 发布

EversChen5

最新推荐文章于 2025-02-24 03:08:25 发布

阅读量2.9k

点赞数

文章标签： jemalloc tcache

本文链接：https://blog.csdn.net/ip5108/article/details/86751023

版权

为了更好的阅读效果，推荐下载pdf文档：
详细文章请参考：《jemalloc 深入分析》
https://github.com/everschen/tools/blob/master/DOC/Jemalloc.pdf
https://download.csdn.net/download/ip5108/10941278

Tcache 实现原理

3.1. TSD:thread specific data 线程局部存储
pthread_setspecific(a_name##tsd_tsd, (void *)wrapper)
a_name##tsd_wrapper_t *wrapper = (a_name##tsd_wrapper_t *)pthread_getspecific(a_name##tsd_tsd);
pthread_key_create(&a_name##tsd_tsd, a_name##tsd_cleanup_wrapper)
a_name##tsd_wrapper_t *wrapper;
wrapper=(a_name##tsd_wrapper_t *) malloc_tsd_malloc(sizeof(a_name##tsd_wrapper_t));
tcache_get(tsd_t *tsd, bool create)，
会先查找tcache，如果不存在，绑定一个arena，再创建tcache_create(tsd_tsdn(tsd), arena) tcache。
tsd_t *tsd,
typedef struct tsd_s tsd_t;
struct tsd_s {
tsd_state_t state;
#define O(n, t)
t n;
MALLOC_TSD
#undef O
};
#define O(n, t)
t *tsd_##n##p_get(tsd_t *tsd);
t tsd_##n##_get(tsd_t *tsd);
void tsd_##n##_set(tsd_t *tsd, t n);
MALLOC_TSD
#undef O
这里定义了如下两个函数（以tcache为例）： tcache_t *tsd_tcache_get(tsd_t *tsd);
void tsd_tcache_set(tsd_t *tsd, tcache_t *tcache);
#define MALLOC_TSD
/* O(name, type) */
O(tcache, tcache_t *)
O(thread_allocated, uint64_t)
O(thread_deallocated, uint64_t)
O(prof_tdata, prof_tdata_t *)
O(iarena, arena_t *)
O(arena, arena_t *)
O(arenas_tdata, arena_tdata_t *)
O(narenas_tdata, unsigned)
O(arenas_tdata_bypass, bool)
O(tcache_enabled, tcache_enabled_t)
O(quarantine, quarantine_t *)
O(witnesses, witness_list_t)
O(witness_fork, bool)
#define TSD_INITIALIZER {
tsd_state_uninitialized,
NULL,
0,
0,
NULL,
NULL,
NULL,
NULL,
0,
false,
tcache_enabled_default,
NULL,
ql_head_initializer(witnesses),
false
}
typedef enum {
tsd_state_uninitialized,
tsd_state_nominal,
tsd_state_purgatory,
tsd_state_reincarnated
} tsd_state_t;
static const tsd_t tsd_initializer = TSD_INITIALIZER;
malloc_tsd_types(, tsd_t)
malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, , tsd_t, tsd_initializer, tsd_cleanup)
#define malloc_tsd_types(a_name, a_type)
typedef struct {
bool initialized;
a_type val;
} a_name##tsd_wrapper_t;
#endif
wrapper->val = a_initializer; // TSD_INITIALIZER;

3.2. Tcache和arena的关系
/*

List of tcaches for extant threads associated with this arena.
Stats from these are merged incrementally, and at exit if
opt_stats_print is enabled.
*/
ql_head(tcache_t) tcache_ql;
arena中的每个thread有一个tcache。
(gdb) p *je_arenas@3
$53 = {0x7ce5c00140, 0x7ce5c8fc00, 0x0}
(gdb) p *je_arenas@3
$7 = {0x7145a00140, 0x7145a8fc00, 0x0}
(gdb) p (*je_arenas[0])->tcache_ql
$8 = {qlh_first = 0x714580e000}
(gdb) $9 = {qlh_first = 0x714580e000}
(gdb) p (*je_arenas[0])->nthreads
$10 = {8, 8}
(gdb) p ((tcache_t *)0x714580e000)->link.qre_next
$11 = (tcache_t *) 0x7145811800
(gdb) p ((tcache_t *)0x7145811800)->link.qre_next
$12 = (tcache_t *) 0x7144079800
(gdb) p ((tcache_t *)0x7144079800)->link.qre_next
$13 = (tcache_t *) 0x7144076000
(gdb) p ((tcache_t *)0x7144076000)->link.qre_next
$14 = (tcache_t *) 0x7144188000
(gdb) p ((tcache_t *)0x7144188000)->link.qre_next
$15 = (tcache_t *) 0x714418b800
(gdb) p ((tcache_t *)0x714418b800)->link.qre_next
$16 = (tcache_t *) 0x71441d9000
(gdb) p ((tcache_t *)0x71441d9000)->link.qre_next
$17 = (tcache_t *) 0x71441dc800
(gdb) p ((tcache_t *)0x71441dc800)->link.qre_next
$18 = (tcache_t *) 0x714580e000

3.3. Tcache的定义
/*

Number of tcache bins. There are NBINS=36 small-object bins, plus 0 or more
large-object bins.
*/
extern unsigned nhbins;//总共有45个bins，small bin用掉36个，还有9个是large用的。
这个值的计算如下：Android在编译选项中定义了-DANDROID_LG_TCACHE_MAXCLASS_DEFAULT=16（jemalloc默认是15）。取16时，算得的tcache_maxclass=64k；取15时，tcache_maxclass=32k。然后根据这个值来计算tcache的bin数量：nhbins = size2index(tcache_maxclass) + 1。默认配置（32k）下nhbins=41，即有5个large被cache；Android配置（64k）下nhbins=45，即有9个large被cache。详细size2index的计算参考size2index的计算过程。
struct tcache_bin_s {
tcache_bin_stats_t tstats;
int low_water; /* Min # cached since last GC. */
unsigned lg_fill_div; /* Fill (ncached_max >> lg_fill_div). */
unsigned ncached; /* # of cached objects. */
/*
To make use of adjacent cacheline prefetch, the items in the avail
stack goes to higher address for newer allocations. avail points
just above the available space, which means that
avail[-ncached, ... -1] are available items and the lowest item will
be allocated first.
*/
void **avail; /* Stack of available objects. */
};
struct tcache_s {
ql_elm(tcache_t) link; /* Used for aggregating stats. */
uint64_t prof_accumbytes; /* Cleared after arena_prof_accum(). */
ticker_t gc_ticker; /* Drives incremental GC. */
szind_t next_gc_bin; /* Next bin to GC. */
tcache_bin_t tbins[1]; /* Dynamically sized. */
/*
The pointer stacks associated with tbins follow as a contiguous
array. During tcache initialization, the avail pointer in each
element of tbins is initialized to point to the proper offset within
this array.
*/
};
3.4. Tcache的结构

avail指向下一个stack的开始位置，也就是stack[ncached_max-1]的下一个元素。所以后续插入的时候用*(tbin->avail - nfill + i) = ptr，如果nfill=4，则从avail[-4]，avail[-3]，avail[-2]，avail[-1]顺序放置，-1刚好是第一个位置。
/*
To make use of adjacent cacheline prefetch, the items in the avail
stack goes to higher address for newer allocations. avail points
just above the available space, which means that
avail[-ncached, … -1] are available items and the lowest item will
be allocated first.
*/
/*
avail points past the available space. Allocations will
access the slots toward higher addresses (for the benefit of
prefetch).
*/
tcache->tbins[i].avail = (void **)((uintptr_t)tcache +(uintptr_t)stack_offset);
avail越过了可用的空间，指到了下一个stack的开始位置，分配是从低往高地址进行，主要是考虑了prefetch的好处。

3.5. Tcache boot与初始化
当前45个tbin的ncached_max值：
(gdb) p *je_tcache_bin_info@45
$7 = { {ncached_max = 8} <repeats 16 times>, {ncached_max = 20}, {
ncached_max = 8}, {ncached_max = 8}, {ncached_max = 8}, { ncached_max = 20}, {ncached_max = 8}, {ncached_max = 20}, { ncached_max = 8}, {ncached_max = 20} <repeats 12 times>, {
ncached_max = 16}, {ncached_max = 16}, {ncached_max = 16}, {
ncached_max = 16}, {ncached_max = 16}, {ncached_max = 16}, {
ncached_max = 16}, {ncached_max = 16}, {ncached_max = 16}}
Tcache的栈元素总共为：stack_nelms=21×8+20×15+16×9=612
Jemalloc 深入分析
Copyright 2013 Spreadtrum Communications Inc. 61
#define SMALL_MAXCLASS ((((size_t)1) << 13) + (((size_t)3) << 11))
SMALL_MAXCLASS=14k=14336
当size区间在SMALL_MAXCLASS=14336 < size <= tcache_maxclass=65536时，分配请求由tcache的large bin处理（tcache_alloc_large）。

3.6. Tcache fill过程
填充函数：arena_tcache_fill_small，对于large没有fill的过程。
lg_fill_div初始化是1，在填充cache时，会填tcache_bin_info[binind].ncached_max >> tbin->lg_fill_div个，也就是说填充ncached_max/2的个数。
如果当前bin的runcur有可用的region，直接调用arena_run_reg_alloc分配内存；如果不存在runcur，或者当前run没有可用的region，则调用arena_bin_malloc_hard分配一个新的可用run到runcur，再调用arena_run_reg_alloc进行分配内存，下一次循环的时候就可以直接用arena_run_reg_alloc分配了。
填充的时候从栈顶开始，往下填充，如果填充到一半，没有把整个nfill填满，这个时候需要移动前面的填充信息，确保填充的位置是从栈底开始的。

3.7. Tcache的分配过程
tcache_alloc_small
如果当前cache里有可用的缓存，直接调用tcache_alloc_easy分配，要不调用tcache_alloc_small_hard，先进行填充，再进行缓存分配。
tcache_alloc_easy不用加锁，因为是线程内。如果tcache分配完了，需要arena_tcache_fill_small去填充tcache时，这个时候需要arena lock的加锁保护。
tcache_alloc_easy(tcache_bin_t *tbin, bool *tcache_success)
{
void *ret;
if (unlikely(tbin->ncached == 0)) {
tbin->low_water = -1;
*tcache_success = false;
return (NULL);
}
*tcache_success = true;
ret = *(tbin->avail - tbin->ncached);
tbin->ncached--;
if (unlikely((int)tbin->ncached < tbin->low_water))
tbin->low_water = tbin->ncached;
return (ret);
}
如果tbin->ncached =0，返回tcache_success = false ，会调用tcache_alloc_small_hard先进行填充，然后再进行tcache_alloc_easy分配。
如果tbin->ncached != 0，直接取栈顶元素返回，tbin->ncached--，检查是否需要调整tbin->low_water。

3.8. Tcache的回收flush过程
tcache_bin_flush_small，在GC过程中可能会触发flush操作，还有在释放过程中，如果cache的数量达到了ncached_max值，也需要进行flush回收。
在释放过程中，每次总是挑选栈底region所在的arena的region先进行释放。如果非当前arena的region，则先保存在栈底，在下一个循环中释放。
arena_decay_ticks的作用？这个是用来清理arena层面的arena->ndirty的数量，也是一种回收机制。
(gdb) p (*je_arenas[0])->tcache_ql
$33 = {qlh_first = 0x6f8200e000}
(gdb) p ((tcache_t *)0x6f8200e000)
$34 = (tcache_t *) 0x6f8200e000
(gdb) p *((tcache_t *)0x6f8200e000)
$35 = {link = {qre_next = 0x6f82011800, qre_prev = 0x6f80b85c00},
prof_accumbytes = 0, gc_ticker = {tick = 9, nticks = 228},
next_gc_bin = 14, tbins = {{tstats = {nrequests = 6}, low_water = 0,
lg_fill_div = 1, ncached = 2, avail = 0x6f8200e608}}}
(gdb) p ((tcache_t *)0x6f8200e000)->gc_ticker
$36 = {tick = 9, nticks = 228}
每一次tcache_dalloc_large/tcache_dalloc_small/tcache_alloc_large/tcache_alloc_small都会调用tcache_event，然后做tick-1动作，直到228个初始值被减完，再恢复初始值228，然后触发tcache_event_hard(tsd, tcache);
Jemalloc 深入分析
Copyright 2013 Spreadtrum Communications Inc. 63
tcache_event->tcache_event_hard
struct ticker_s {
int32_t tick;
int32_t nticks;
};
ticker_t gc_ticker; /* Drives incremental GC. */
szind_t next_gc_bin; /* Next bin to GC. */
/* Number of tcache allocation/deallocation events between incremental GCs. */
#define TCACHE_GC_INCR
((TCACHE_GC_SWEEP / NBINS) + ((TCACHE_GC_SWEEP / NBINS == 0) ? 0 : 1))
TCACHE_GC_SWEEP=8192，
NBINS=36
TCACHE_GC_INCR=228
tcache_event(tsd_t *tsd, tcache_t *tcache)
{
if (TCACHE_GC_INCR == 0)
return;
if (unlikely(ticker_tick(&tcache->gc_ticker)))
tcache_event_hard(tsd, tcache);
}
tcache_event_hard
int low_water; /* Min # cached since last GC. */ 自上次GC以来该bin最低cache的数量。
unsigned lg_fill_div; /* Fill (ncached_max >> lg_fill_div). */
unsigned ncached; /* # of cached objects. */
1）如果low_water > 0，则把low_water这部分cache的3/4（向上取整）flush回arena，即flush之后保留ncached - low_water + (low_water >> 2)个cache。
/*
Reduce fill count by 2X. Limit lg_fill_div such that the
fill count is always at least 1.
*/
if ((tbin_info->ncached_max >> (tbin->lg_fill_div+1)) >= 1)
tbin->lg_fill_div++;
2）如果low_water < 0，增加cache填充数为2倍。
/*
Increase fill count by 2X. Make sure lg_fill_div stays
greater than 0.
*/
if (tbin->lg_fill_div > 1)
tbin->lg_fill_div--;
设置tbin->low_water = tbin->ncached; （另外只有在当ncached小于low_water时做调整。）所以要使low_water>0，必须是一个GC周期内该bin还是没有分配完。也就是说这个reg_size的bin在当前线程中使用较少。所以会降低缓存数量。
再调整到下一个gc bin，指向tcache->next_gc_bin++;
lg_fill_div初始化为1，所以low_water在第一次tcache_alloc_easy分配的时候被初始化为-1，-1不可能大于任何tbin->ncached，所以直到第一次触发GC，low_water一直为-1，因为lg_fill_div=1，所以第一次触发GC的时候只是修改了low_water值，为当前的tbin->ncached。
当tbin->ncached达到0后，也就是分配完成后，会设置tbin->low_water = -1; 然后才会有可能触发tbin->lg_fill_div--;
所以这里的分配思想是：如果在一个周期内使用的少（low_water > 0），则把low_water这部分cache的3/4 flush掉（只保留约1/4），然后提高lg_fill_div的值，使得下次填充数量减少。相反，如果在一个周期内使用的较多，则会导致ncached达到0，这样low_water会被置为-1，然后减少lg_fill_div的值，使得下次填充的量增加。（lg_fill_div的最小值为1）
如果lg_fill_div最小值为1，那么每次填充只能填最大值ncached_max的一半，这样另一半空间是不是浪费？不浪费，因为可能释放回收，需要空间填充，如果达到ncached_max后，就需要从tcache释放了。
tcaches_create 这个没有调用到。
tcache四个分配和回收函数全部做inline处理，减少函数调用开销。

3.9. Android 对TCACHE_NSLOTS_SMALL_MAX配置问题
对于Android的配置，
#define TCACHE_NSLOTS_SMALL_MIN 20
#define TCACHE_NSLOTS_SMALL_MAX ANDROID_TCACHE_NSLOTS_SMALL_MAX=8
for (i = 0; i < NBINS; i++) {
if ((arena_bin_info[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MIN) {
tcache_bin_info[i].ncached_max =
TCACHE_NSLOTS_SMALL_MIN;
} else if ((arena_bin_info[i].nregs << 1) <=
TCACHE_NSLOTS_SMALL_MAX) {
tcache_bin_info[i].ncached_max =
(arena_bin_info[i].nregs << 1);
} else {
tcache_bin_info[i].ncached_max =
TCACHE_NSLOTS_SMALL_MAX;
}
stack_nelms += tcache_bin_info[i].ncached_max;
}
for (; i < nhbins; i++) {
tcache_bin_info[i].ncached_max = TCACHE_NSLOTS_LARGE;
stack_nelms += tcache_bin_info[i].ncached_max;
}
(gdb) p *je_tcache_bin_info@45
$7 = {
{ncached_max = 8} <repeats 16 times>, {ncached_max = 20}, {
ncached_max = 8}, {ncached_max = 8}, {ncached_max = 8}, {
ncached_max = 20}, {ncached_max = 8}, {ncached_max = 20}, {
ncached_max = 8}, {ncached_max = 20} <repeats 12 times>, {
ncached_max = 16}, {ncached_max = 16}, {ncached_max = 16}, {
ncached_max = 16}, {ncached_max = 16}, {ncached_max = 16}, {
ncached_max = 16}, {ncached_max = 16}, {ncached_max = 16}}
对于Android的配置，TCACHE_NSLOTS_SMALL_MIN > TCACHE_NSLOTS_SMALL_MAX 可能会造成不是期望的值的问题；其实中间的else if ((arena_bin_info[i].nregs << 1) <= TCACHE_NSLOTS_SMALL_MAX)永远得不到执行，因为TCACHE_NSLOTS_SMALL_MAX < TCACHE_NSLOTS_SMALL_MIN。所以造成nregs值很大的bin反而只有ncached_max=8，nregs值不超过10的bin反而ncached_max=20，这个应该是一个问题。
jemalloc的默认TCACHE_NSLOTS_SMALL_MIN=20，TCACHE_NSLOTS_SMALL_MAX=200：如果2*nregs <= TCACHE_NSLOTS_SMALL_MIN=20，则ncached_max=20；否则如果2*nregs <= TCACHE_NSLOTS_SMALL_MAX=200，则ncached_max=2*nregs；否则ncached_max=200（上限）。Jemalloc这样设计的目的是让Tcache的缓存数量和一个run中region数量的2倍建立联系，也就是说一个run中的region越多，相应的Tcache的最大cache数量也越大。
这样会导致nregs>10时，ncached_max=8，实际分配的cache数量只有4个，因为lg_fill_div的最小值为1，而如果没有android的这个配置文件，这个值是在20-200之间，实际分配的数量是10-100之间，大大降低了缓冲数量，而这些主要集中在reg_size比较小的bin中，8，16, 32, 48, 64, 80, 96, 112,128,160,192,224,256,320,384,448…，这些region在实际分配中一般有大量的分配，所以造成了一定的不合理性。
Android这么配置的目的是？是因为线程很多，而且分配内存可能没有这么高的需求？所以限制了ncached_max值比较小。经过测试吗？