一、item
1、item是存储数据的最小单位。
typedef struct _stritem { //Structure for storing items within memcached.
struct _stritem *next;
struct _stritem *prev;
struct _stritem *h_next; /* hash chain next, 即hash桶中该项的下一项*/
rel_time_t time; /* least recent access */
rel_time_t exptime; /* expire time */
int nbytes; /* size of data */
unsigned short refcount;
uint8_t nsuffix; /* length of flags-and-length string */
uint8_t it_flags; /* ITEM_* above */
uint8_t slabs_clsid;/* which slab class we're in */
uint8_t nkey; /* key length, w/terminating null and padding */
/* this odd type prevents type-punning issues when we do
* the little shuffle to save space when not using CAS. */
union {
uint64_t cas;
char end;
} data[];
/* if it_flags & ITEM_CAS we have 8 bytes CAS */
/* then null-terminated key */
/* then " flags length\r\n" (no terminating null) */
/* then data with terminating \r\n (no terminating null; it's binary!) */
} item;
2、item相关操作
(1) do_item_alloc
/*
* Allocates a new item.
*/
item *item_alloc(char *key, size_t nkey, int flags, rel_time_t exptime, int nbytes) {
item *it;
/* do_item_alloc handles its own locks */
it = do_item_alloc(key, nkey, flags, exptime, nbytes, 0);
return it;
}
item *do_item_alloc(char *key, const size_t nkey, const int flags,
const rel_time_t exptime, const int nbytes,
const uint32_t cur_hv) { //nkey:the length of key; nbytes: the size of value
uint8_t nsuffix;
item *it = NULL;
char suffix[40];
size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix); //ntatal为包含item、key、value等的总长度
if (settings.use_cas) {
ntotal += sizeof(uint64_t);
}
unsigned int id = slabs_clsid(ntotal); //id表示使用第id个slabclass_t中的槽,来存放item。
if (id == 0)
return 0;
mutex_lock(&cache_lock); //Lock for cache operations (item_*, assoc_*)
/* do a quick check if we have any expired items in the tail.. */
int tries = 5;
int tried_alloc = 0;
item *search;
void *hold_lock = NULL;
rel_time_t oldest_live = settings.oldest_live;
search = tails[id];
/* We walk up *only* for locked items. Never searching for expired.
* Waste of CPU for almost all deployments */
for (; tries > 0 && search != NULL; tries--, search=search->prev) { //实际上这里就是LRU的应用,检测tail处的item是否过期
uint32_t hv = hash(ITEM_key(search), search->nkey, 0);
/* Attempt to hash item lock the "search" item. If locked, no
* other callers can incr the refcount
*/
/* FIXME: I think we need to mask the hv here for comparison? */
if (hv != cur_hv && (hold_lock = item_trylock(hv)) == NULL)
continue;
/* Now see if the item is refcount locked */
if (refcount_incr(&search->refcount) != 2) {
refcount_decr(&search->refcount);
/* Old rare bug could cause a refcount leak. We haven't seen
* it in years, but we leave this code in to prevent failures
* just in case */
if (search->time + TAIL_REPAIR_TIME < current_time) { //这里防止bug导致item的reference一直不能降到1,所以超过3小时未访问就进行一次修复
itemstats[id].tailrepairs++; //search->time为least recent access
search->refcount = 1;
do_item_unlink_nolock(search, hv); //将该item删除,其中调用assoc_delete(), item_unlink_q(), do_item_remove();
} //注意,do_item_alloc()一开始就取得了cache_lock锁,该锁用于hash表和双向链表的维护。
if (hold_lock)
item_trylock_unlock(hold_lock);
continue;
}
/* Expired or flushed */
if ((search->exptime != 0 && search->exptime < current_time) //超时
|| (search->time <= oldest_live && oldest_live <= current_time)) { //dead by flash
itemstats[id].reclaimed++;
if ((search->it_flags & ITEM_FETCHED) == 0) {
itemstats[id].expired_unfetched++;
}
it = search;
slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal); //虽然属于同一个slabclass,但长度可能不同,需要调整
do_item_unlink_nolock(it, hv);
/* Initialize the item block: */
it->slabs_clsid = 0;
} else if ((it = slabs_alloc(ntotal, id)) == NULL) {
tried_alloc = 1;
if (settings.evict_to_free == 0) {
itemstats[id].outofmemory++;
} else {
itemstats[id].evicted++;
itemstats[id].evicted_time = current_time - search->time;
if (search->exptime != 0)
itemstats[id].evicted_nonzero++;
if ((search->it_flags & ITEM_FETCHED) == 0) {
itemstats[id].evicted_unfetched++;
}
it = search;
slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal);
do_item_unlink_nolock(it, hv);
/* Initialize the item block: */
it->slabs_clsid = 0;
/* If we've just evicted an item, and the automover is set to
* angry bird mode, attempt to rip memory into this slab class.
* TODO: Move valid object detection into a function, and on a
* "successful" memory pull, look behind and see if the next alloc
* would be an eviction. Then kick off the slab mover before the
* eviction happens.
*/
if (settings.slab_automove == 2)
slabs_reassign(-1, id);
}
}
refcount_decr(&search->refcount); //注意此处为原子操作
/* If hash values were equal, we don't grab a second lock */
if (hold_lock)
item_trylock_unlock(hold_lock); //该item已经从hash表中取出,无需持有item锁
break;
}
if (!tried_alloc && (tries == 0 || search == NULL)) //由于这是第一次执行,而且没有设置prealloc,所以需要执行slabs_alloc()
it = slabs_alloc(ntotal, id);
if (it == NULL) {
itemstats[id].outofmemory++;
mutex_unlock(&cache_lock);
return NULL;
}
assert(it->slabs_clsid == 0);
assert(it != heads[id]);
/* Item initialization can happen outside of the lock; the item's already
* been removed from the slab LRU.这可以从下一小节中看出slabs_alloc()返回的item已从LRU中取出
*/
it->refcount = 1; /* the caller will have a reference */
mutex_unlock(&cache_lock); //之后为对it的初始化,与hash表和链表无关,已经无需持有该锁。
it->next = it->prev = it->h_next = 0;
it->slabs_clsid = id;
DEBUG_REFCNT(it, '*');
it->it_flags = settings.use_cas ? ITEM_CAS : 0;
it->nkey = nkey;
it->nbytes = nbytes;
memcpy(ITEM_key(it), key, nkey); //ITEM_key()用于得到item后放置key的指针(item后紧接着就是key)
it->exptime = exptime;
memcpy(ITEM_suffix(it), suffix, (size_t)nsuffix);
it->nsuffix = nsuffix;
return it;
}
注意: cache_lock - Lock for cache operations (item_*, assoc_*),如assoc_insert(), item_link_q(), item_free(),跟哈希表、双向链表都有关。
item_locks - 负责hash表的处理,分为粗粒度和细粒度。
(2) 其他
item_free()
调用 slabs_free ()将其释放到空闲链表中((&slabclass[id])->slots)。
do_item_link()
1.调用 assoc_insert()将 item 指针插入 hash 表中
2.调用 get_cas_id()给 it 的 cas_id 赋值。
3.调用 item_link_q(),把该 item 插入 LRU 队列的最前面 4.调用refcount_incr(&it->refcount)增加引用计数
do_item_unlink ()
1.调用 assoc_delete()在 hash 表中删除此 item
2.调用 item_unlink_q()从该 slab class 去除此 item 的连接,此处考虑了 item 在链表头
部,尾部等各种情况。
3.do_item_remove()减小引用计数,当引用计数为 0 的时候,调用 item_free()将其释放。
do_item_remove ()
减少引用计数 refcount,当发现引用计数为 0 的时候,就调用item_free()将其释放。
do_item_update ()
先调用 item_unlink_q(),更新了时间以后,再调用 item_link_q()。将其重新连接到LRU 队列之中,即让该 item 移到 LRU 队列的最前。
do_item_replace (item* it, item* new_it, const uint32_t hv)
调用 do_item_unlink()解除原有 item 的连接,再调用 do_item_link()连接到新的item。do_item_unlink()和do_item_link()内部是用cache_lock来保证线程安全的,但是这两个函数之间可以执行其他线程(可以不是线程安全的)。
item_get ()
值得说明的是,memcached 的懒惰删除逻辑在这里有体现。就是当你需要 get 一个item 的时候才考虑该 item 是否应该删除。首先调用 do_item_get()函数,根据关键字使用 assoc_find()查找 item,如果没找到,返回 NULL,再判断是否达到最大的生存时间,如果是的话,就do_item_unlink() 该 item,返回 NULL。
如果该 item 没有满足删除条件,将其引用计数加 1,并返回该 item 的指针。
注意:do_item_*和item_free()定义于item.c中;而与item access相关的item_*多数定义于thread.c中,一般由该函数先获得item细粒度锁,内部调用对应的do_item_*();但是也有例外,如item_alloc()由do_item_alloc()自身去处理锁,item_replace()也不需要获得item细粒度锁(本身不需要线程安全)。
二、slab
1、数据结构
/* powers-of-N allocation structures */
typedef struct {
unsigned int size; /* sizes of items 每个小块(chunk,用于放置item)的大小*/
unsigned int perslab; /* how many items per slab */
void *slots; /* list of item ptrs , free items*/
unsigned int sl_curr; /* total free items in list */
unsigned int slabs; /* how many slabs were allocated for this class */
void **slab_list; /* array of slab pointers */
unsigned int list_size; /* size of prev array */
unsigned int killing; /* index+1 of dying slab, or zero if none */
size_t requested; /* The number of requested bytes */
} slabclass_t;
static slabclass_t slabclass[MAX_NUMBER_OF_SLAB_CLASSES];
slabclass是由([MAX_NUMBER_OF_SLAB_CLASSES)个slabclass_t结构体构成的数组,每个slabclass_t结构体的size变量是根据增长因子递增的。
/**
* Determines the chunk sizes and initializes the slab class descriptors
* accordingly.
*/
void slabs_init(const size_t limit, const double factor, const bool prealloc) {
int i = POWER_SMALLEST - 1;
unsigned int size = sizeof(item) + settings.chunk_size;
mem_limit = limit;
if (prealloc) {
/* Allocate everything in a big chunk with malloc */
mem_base = malloc(mem_limit);
if (mem_base != NULL) {
mem_current = mem_base;
mem_avail = mem_limit;
} else {
fprintf(stderr, "Warning: Failed to allocate requested memory in"
" one large chunk.\nWill allocate in smaller chunks\n");
}
}
memset(slabclass, 0, sizeof(slabclass));
while (++i < POWER_LARGEST && size <= settings.item_size_max / factor) {
/* Make sure items are always n-byte aligned */
if (size % CHUNK_ALIGN_BYTES)
size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
slabclass[i].size = size;
slabclass[i].perslab = settings.item_size_max / slabclass[i].size;
size *= factor;
if (settings.verbose > 1) {
fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
i, slabclass[i].size, slabclass[i].perslab);
}
}
power_largest = i;
slabclass[power_largest].size = settings.item_size_max;
slabclass[power_largest].perslab = 1;
if (settings.verbose > 1) {
fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
i, slabclass[i].size, slabclass[i].perslab);
}
/* for the test suite: faking of how much we've already malloc'd */
{
char *t_initial_malloc = getenv("T_MEMD_INITIAL_MALLOC");
if (t_initial_malloc) {
mem_malloced = (size_t)atol(t_initial_malloc);
}
}
if (prealloc) {
slabs_preallocate(power_largest);
}
}
slabclass数组(共201个struct slabclass_t);sl_curr即slots所指双向链表的个数(空闲的),ptr所指的1M内存就是被split为该空闲链表(p->slab_list[p->slabs++] = ptr,详见do_slabs_newslab()):
注意:存在一个head数组(201个元素)和一个tail数组(201个元素)来链接存放着item(非空闲的)的slab,并用以实现LRU。但这只是每个slabclass_t内部的LRU,而非全局,可参考 item_link_q()的实现。
2、slab相关操作
(1) slabs_alloc()和do_slabs_alloc()
void *slabs_alloc(size_t size, unsigned int id) {
void *ret;
pthread_mutex_lock(&slabs_lock);
ret = do_slabs_alloc(size, id);
pthread_mutex_unlock(&slabs_lock);
return ret;
}
static void *do_slabs_alloc(const size_t size, unsigned int id) {
slabclass_t *p;
void *ret = NULL;
item *it = NULL;
if (id < POWER_SMALLEST || id > power_largest) {
MEMCACHED_SLABS_ALLOCATE_FAILED(size, 0);
return NULL;
}
p = &slabclass[id];
assert(p->sl_curr == 0 || ((item *)p->slots)->slabs_clsid == 0);
/* fail unless we have space at the end of a recently allocated page,
we have something on our freelist, or we could allocate a new page */
if (! (p->sl_curr != 0 || do_slabs_newslab(id) != 0)) { //当空闲链表尾空时,调用do_slabs_newslab()
/* We don't have more memory available */
ret = NULL;
} else if (p->sl_curr != 0) { //将该slot(用于放置item)从空闲链表中取出。
/* return off our freelist */
it = (item *)p->slots;
p->slots = it->next;
if (it->next) it->next->prev = 0;
p->sl_curr--;
ret = (void *)it;
}
if (ret) {
p->requested += size;
MEMCACHED_SLABS_ALLOCATE(size, id, p->size, ret);
} else {
MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
}
return ret;
}
static int do_slabs_newslab(const unsigned int id) {
slabclass_t *p = &slabclass[id];
int len = settings.slab_reassign ? settings.item_size_max
: p->size * p->perslab;
char *ptr;
if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0) ||
(grow_slab_list(id) == 0) ||
((ptr = memory_allocate((size_t)len)) == 0)) { //grow_slab_list()用于防止p->listsize不够用,如果没问题则返回1
//memory_allocate()申请一段内存(如果已经prealloc过了,则直接从中取一段)
MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
return 0;
}
memset(ptr, 0, (size_t)len);
split_slab_page_into_freelist(ptr, id);
p->slab_list[p->slabs++] = ptr;
mem_malloced += len;
MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id);
return 1;
}
(2)其他
slabs_init()
slab 初始化,如果配置时采用预分配机制(prealloc)则在先在这使用 malloc 分配所有内存。再根据增长因子 factor 给每个 slabclass 分配容量。
slabs_clsid()
计算出哪个 slabclass 适合用来储存大小给定为 size 的 item, 如果返回值为 0 则存储的物件过大,无法进行存储。
do_slabs_free()
首先检查当目前的插糟是否已经达到可用总插糟的总容量,如果达到就为其重新分配空间,再将该回收的 item 的指针插入对应当前 id 的 slabclass 的插糟 (slots) 之中。
do_slabs_stats()
将目前 slab 的状态填充至 buf 缓存中并将其返回。
do_slabs_reassign()
清除在一个 slab class 里的所有的 item,将其移动到另外一个 class 。只有在处理”slab reassign”命令选择手动调整内存分配的时候才会使用,默认是禁止的。
注意:slabs_*和do_slabs_*一般都定义与slabs.c中。一般由slabs_*()来获得slabs_lock锁,内部调用对应的do_slabs_*()。