Ceph's BlueStore cache holds object information, both metadata and actual data: the metadata side is managed with an LRU scheme, while the actual data is managed by the TwoQ (2Q) cache.
(1) Memory management
BlueStore defines a set of namespaces, each carrying its own declarations, through the following macro:
#define P(x) \
namespace x { \
static const mempool::pool_index_t id = mempool::mempool_##x; \
template<typename v> \
using pool_allocator = mempool::pool_allocator<id,v>; \
\
using string = std::basic_string<char,std::char_traits<char>, \
pool_allocator<char>>; \
\
template<typename k,typename v, typename cmp = std::less<k> > \
using map = std::map<k, v, cmp, \
pool_allocator<std::pair<const k,v>>>; \
\
template<typename k,typename v, typename cmp = std::less<k> > \
using compact_map = compact_map<k, v, cmp, \
pool_allocator<std::pair<const k,v>>>; \
\
template<typename k, typename cmp = std::less<k> > \
using compact_set = compact_set<k, cmp, pool_allocator<k>>; \
\
template<typename k,typename v, typename cmp = std::less<k> > \
using multimap = std::multimap<k,v,cmp, \
pool_allocator<std::pair<const k, \
v>>>; \
\
template<typename k, typename cmp = std::less<k> > \
using set = std::set<k,cmp,pool_allocator<k>>; \
\
template<typename k, typename cmp = std::less<k> > \
using flat_set = boost::container::flat_set<k,cmp,pool_allocator<k>>; \
\
template<typename k, typename v, typename cmp = std::less<k> > \
using flat_map = boost::container::flat_map<k,v,cmp, \
pool_allocator<std::pair<k,v>>>; \
\
template<typename v> \
using list = std::list<v,pool_allocator<v>>; \
\
template<typename v> \
using vector = std::vector<v,pool_allocator<v>>; \
\
template<typename k, typename v, \
typename h=std::hash<k>, \
typename eq = std::equal_to<k>> \
using unordered_map = \
std::unordered_map<k,v,h,eq,pool_allocator<std::pair<const k,v>>>;\
\
inline size_t allocated_bytes() { \
return mempool::get_pool(id).allocated_bytes(); \
} \
inline size_t allocated_items() { \
return mempool::get_pool(id).allocated_items(); \
} \
};
DEFINE_MEMORY_POOLS_HELPER(P)
allocated_items() and allocated_bytes() report how many items and bytes are currently allocated from the pool.
The DEFINE_MEMORY_POOLS_HELPER macro is defined as follows:
#define DEFINE_MEMORY_POOLS_HELPER(f) \
f(bloom_filter) \
f(bluestore_alloc) \
f(bluestore_cache_data) \
f(bluestore_cache_onode) \
f(bluestore_cache_other) \
f(bluestore_fsck) \
f(bluestore_txc) \
f(bluestore_writing_deferred) \
f(bluestore_writing) \
f(bluefs) \
f(buffer_anon) \
f(buffer_meta) \
f(osd) \
f(osd_mapbl) \
f(osd_pglog) \
f(osdmap) \
f(osdmap_mapping) \
f(pgmap) \
f(mds_co) \
f(unittest_1) \
f(unittest_2)
Together, these two macros define one namespace per memory pool, along with the types and accounting helpers bound to each namespace.
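As a concrete illustration, here is a minimal sketch (hypothetical usage, not taken from BlueStore itself; the names extent_index and report_pool_usage are invented) of how code can place a container in one of these pools and query its accounting:
#include "include/mempool.h"
#include <iostream>

// A std::map whose nodes are charged to the bluestore_cache_other pool.
mempool::bluestore_cache_other::map<uint64_t, uint64_t> extent_index;

void report_pool_usage() {
  // Each pool namespace exposes the accounting helpers generated by P(x).
  std::cout << "bytes: "
            << mempool::bluestore_cache_other::allocated_bytes()
            << " items: "
            << mempool::bluestore_cache_other::allocated_items()
            << std::endl;
}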
(2) Overloading the new and delete operators
#define MEMPOOL_DEFINE_OBJECT_FACTORY(obj,factoryname,pool) \
MEMPOOL_DEFINE_FACTORY(obj, factoryname, pool) \
void *obj::operator new(size_t size) { \
return mempool::pool::alloc_##factoryname.allocate(1); \
} \
void obj::operator delete(void *p) { \
return mempool::pool::alloc_##factoryname.deallocate((obj*)p, 1); \
}
BlueStore.cc uses this macro as follows:
// bluestore_cache_onode
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
bluestore_cache_onode);
// bluestore_cache_other
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
bluestore_cache_other);
// bluestore_txc
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
bluestore_txc);
Expanding the first invocation gives:
namespace mempool{
namespace bluestore_cache_onode{
pool_allocator<BlueStore::Onode> alloc_bluestore_onode = {true};
}
}
void * BlueStore::Onode::operator new(size_t size) {
return mempool::bluestore_cache_onode::alloc_bluestore_onode.allocate(1);
}
void BlueStore::Onode::operator delete(void *p) {
return mempool::bluestore_cache_onode::alloc_bluestore_onode.deallocate((BlueStore::Onode*)p, 1);
}
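With these overloads in place, every plain new/delete of an Onode is transparently charged to the pool; no call site needs to know about mempools (an illustrative summary, not BlueStore code):
// Onode *o = new Onode(...);  -> alloc_bluestore_onode.allocate(1),
//                                accounted in mempool_bluestore_cache_onode
// delete o;                   -> alloc_bluestore_onode.deallocate(o, 1),
//                                accounting reversed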
For the bluestore_cache_onode namespace, the pool_allocator is:
static const mempool::pool_index_t id = mempool::mempool_bluestore_cache_onode;
template<typename v>
using pool_allocator = mempool::pool_allocator<id,v>;
pool_allocator(bool force_register=false) // force_register is true here
init(force_register);
pool = &get_pool(pool_ix); // pool_ix is mempool_bluestore_cache_onode
static mempool::pool_t table[num_pools];
return table[ix];
if (debug_mode || force_register)
type = pool->get_type(typeid(T), sizeof(T)); // i.e. pool->get_type(typeid(BlueStore::Onode), sizeof(BlueStore::Onode))
auto p = type_map.find(ti.name()); // not found on the first call
if (p != type_map.end())
return &p->second;
type_t &t = type_map[ti.name()];
t.type_name = ti.name();
t.item_size = size;
return &t;
There are num_pools pools in total; for the bluestore_cache_onode namespace, get_pool() returns the bluestore_cache_onode pool.
The overloaded operator new then comes down to:
return mempool::bluestore_cache_onode::alloc_bluestore_onode.allocate(1);
size_t total = sizeof(T) * n; // T is BlueStore::Onode here
shard_t *shard = pool->pick_a_shard();
size_t me = (size_t)pthread_self();
size_t i = (me >> 3) & ((1 << num_shard_bits) - 1);
return &shard[i]; // pool_t defines shard_t shard[num_shards], with num_shards = 1 << num_shard_bits and num_shard_bits = 5
shard->bytes += total;
shard->items += n;
if (type)
type->items += n;
T* r = reinterpret_cast<T*>(new char[total]); // the actual memory allocation
return r;
So allocate() just updates the accounting in one of the pool's shards and then performs the real allocation.
As a result, every new Onode records the allocated memory in a shard of the corresponding pool.
The other namespaces behave the same way.
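To make the accounting concrete, here is a minimal, self-contained sketch of the same idea (a simplified stand-in, not Ceph's pool_allocator: it uses a single global counter pair instead of the 32 shards that pick_a_shard() selects by thread id):
#include <atomic>
#include <cstddef>

// Hypothetical simplified pool: one counter pair instead of
// Ceph's shard_t shard[num_shards] indexed by pthread_self().
struct simple_pool_t {
  std::atomic<long> bytes{0};
  std::atomic<long> items{0};
};

template <typename T>
struct counting_allocator {
  simple_pool_t *pool;

  T *allocate(std::size_t n) {
    std::size_t total = sizeof(T) * n;
    pool->bytes += total;                          // like shard->bytes += total
    pool->items += n;                              // like shard->items += n
    return reinterpret_cast<T*>(new char[total]);  // the actual allocation
  }
  void deallocate(T *p, std::size_t n) {
    pool->bytes -= sizeof(T) * n;                  // reverse the accounting
    pool->items -= n;
    delete[] reinterpret_cast<char*>(p);
  }
};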
(3) Inserting object data into the cache
Data is inserted into the cache through the _buffer_cache_write() function:
_buffer_cache_write(txc, b, b_off, bl, wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
b->shared_blob->bc.write(b->shared_blob->get_cache(), txc->seq, offset, bl, flags); // the shared_blob's cache is the TwoQCache owned by BlueStore
Buffer *b = new Buffer(this, Buffer::STATE_WRITING, seq, offset, bl, flags);
b->cache_private = _discard(cache, offset, bl.length());
_add_buffer(cache, b, (flags & Buffer::FLAG_NOCACHE) ? 0 : 1, nullptr);
buffer_map[b->offset].reset(b);
if (b->is_writing())
b->data.reassign_to_mempool(mempool::mempool_bluestore_writing);
if (writing.empty() || writing.rbegin()->seq <= b->seq)
writing.push_back(*b);
else
... // otherwise insert into writing at the position ordered by seq
else
b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data); // deduct from the old pool, then charge the new one
cache->_add_buffer(b, level, near);
txc->shared_blobs_written.insert(b->shared_blob);
In _buffer_cache_write(), the Buffer is linked into the writing list, indicating that its data is being written to disk; the accounting of the mempool_bluestore_writing pool is updated at the same time, and the shared_blob is recorded in shared_blobs_written.
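Putting the pieces together, the life of a buffered write looks roughly like this (a summary of the flow described in this section, not code from BlueStore; finish_write() is covered further below):
// Buffer lifecycle on the write path (summary):
//
//   _buffer_cache_write()
//     -> new Buffer(STATE_WRITING), charged to mempool_bluestore_writing
//     -> linked into BufferSpace::writing; shared_blob recorded in
//        txc->shared_blobs_written
//   ... transaction commits ...
//   _txc_state_proc(STATE_FINISHING) -> finish_write()
//     -> FLAG_NOCACHE set: buffer erased from buffer_map
//     -> otherwise: STATE_CLEAN, re-charged to mempool_bluestore_cache_data,
//        inserted into the TwoQ lists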
Because already-cached data may overlap the new data, the overlapping ranges must be removed first; this is done by _discard(), shown below:
auto i = _data_lower_bound(offset); // find in buffer_map the first Buffer that may overlap offset
uint32_t end = offset + length;
while (i != buffer_map.end())
Buffer *b = i->second.get();
if (b->offset >= end)
break;
if (b->cache_private > cache_private) // if any overlapped segment has a higher cache level, promote to that level
cache_private = b->cache_private;
// part one
if (b->offset < offset) // handle the first overlapping Buffer, which starts before offset
int64_t front = offset - b->offset;
if (b->end() > end)
uint32_t tail = b->end() - end;
bl.substr_of(b->data, b->length - tail, tail);
Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
_add_buffer(cache, nb, 0, b); // insert the new tail Buffer into the TwoQCache
if (!b->is_writing())
cache->_adjust_buffer_size(b, front - (int64_t)b->length); // only updates the accounting
b->truncate(front); // drop b's old data, keeping only its first front bytes
break;
else
if (!b->is_writing())
cache->_adjust_buffer_size(b, front - (int64_t)b->length);
b->truncate(front);
++i;
continue;
// part two
if (b->end() <= end) // a Buffer in the middle that is completely covered
_rm_buffer(cache, i++);
if (p->second->is_writing())
writing.erase(writing.iterator_to(*p->second));
else
cache->_rm_buffer(p->second.get());
buffer_bytes -= b->length;
buffer_list_bytes[b->cache_private] -= b->length;
switch (b->cache_private) {
case BUFFER_WARM_IN:
buffer_warm_in.erase(buffer_warm_in.iterator_to(*b)); break;
case BUFFER_WARM_OUT:
buffer_warm_out.erase(buffer_warm_out.iterator_to(*b)); break;
case BUFFER_HOT:
buffer_hot.erase(buffer_hot.iterator_to(*b)); break;
buffer_map.erase(p);
continue;
uint32_t keep = b->end() - end;
// part three
if (b->data.length()) // handle the last overlapping Buffer, which extends past end
bl.substr_of(b->data, b->length - keep, keep);
Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
_add_buffer(cache, nb, 0, b);
_rm_buffer(cache, i);
(1) Part one handles the Buffer1 case: truncate() shortens the original Buffer1 so that only its first front bytes remain cached.
(2) Part two handles the Buffer2 case: a fully covered Buffer is deleted outright via _rm_buffer(), which also unlinks it from the corresponding TwoQ list.
(3) Part three handles the Buffer3 case: a new Buffer of keep bytes is created and inserted into the TwoQCache and buffer_map, and then Buffer3 itself is removed.
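The three cases can be pictured as follows (an illustrative layout for a new write covering [offset, end); the Buffer1/2/3 names follow the text above):
// new write:          offset                      end
//                        |<------- length ------->|
// Buffer1:    [---front---|#####]                       -> truncate(front)
// Buffer2:                   [#########]                -> _rm_buffer()
// Buffer3:                        [#####|---keep---]    -> new Buffer of keep
//                                                          bytes added at end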
During write-request processing, _txc_state_proc() is invoked at the end; in the final STATE_FINISHING state it processes the buffers on the writing list, as follows:
case TransContext::STATE_FINISHING:
_txc_finish(txc);
for (auto& sb : txc->shared_blobs_written)
sb->bc.finish_write(sb->get_cache(), txc->seq);
auto i = writing.begin();
while (i != writing.end())
if (b->flags & Buffer::FLAG_NOCACHE)
writing.erase(i++);
buffer_map.erase(b->offset);
else
b->state = Buffer::STATE_CLEAN;
writing.erase(i++);
b->maybe_rebuild();
b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
cache->_add_buffer(b, 1, nullptr); //TwoQCache::_add_buffer
if (b->cache_private == BUFFER_NEW) // a buffer starts as BUFFER_NEW and is admitted into buffer_warm_in
b->cache_private = BUFFER_WARM_IN;
if (level > 0)
buffer_warm_in.push_front(*b);
else
buffer_warm_in.push_back(*b);
else
switch (b->cache_private)
case BUFFER_WARM_IN:
buffer_warm_in.push_front(*b); break; // already WARM_IN: only move to the front of the same list
case BUFFER_WARM_OUT: // WARM_OUT means evicted once; promote to HOT (BUFFER_WARM_OUT is only assigned in _trim())
b->cache_private = BUFFER_HOT;
// fall through
case BUFFER_HOT:
buffer_hot.push_front(*b);
For each shared_blob recorded by _buffer_cache_write(), _txc_finish() calls finish_write() on the shared_blob's buffer cache (bc); finish_write() is responsible for moving the buffer into the TwoQ cache lists and updating the accounting of the mempool_bluestore_cache_data pool. New data first lands in the buffer_warm_in list; if it is read or written again later, it is promoted into buffer_hot.
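For reference, the admission and promotion policy implemented by finish_write() and TwoQCache::_add_buffer() can be sketched on its own (a minimal stand-alone illustration using std::list; BlueStore itself uses boost::intrusive lists, and the names here only mirror the real ones):
#include <cstdint>
#include <list>

enum cache_list_t { BUFFER_NEW, BUFFER_WARM_IN, BUFFER_WARM_OUT, BUFFER_HOT };

struct buf_t {
  cache_list_t where = BUFFER_NEW;
};

struct two_q_sketch {
  std::list<buf_t*> warm_in, warm_out, hot;

  // Mirrors the branches of TwoQCache::_add_buffer():
  void add(buf_t *b) {
    switch (b->where) {
    case BUFFER_NEW:        // first insertion: admit into warm_in
      b->where = BUFFER_WARM_IN;
      warm_in.push_front(b);
      break;
    case BUFFER_WARM_IN:    // re-accessed while in warm_in: stay, move to front
      warm_in.remove(b);
      warm_in.push_front(b);
      break;
    case BUFFER_WARM_OUT:   // re-accessed after eviction: promote to hot
      warm_out.remove(b);
      b->where = BUFFER_HOT;
      hot.push_front(b);
      break;
    case BUFFER_HOT:        // already hot: move to front of hot
      hot.remove(b);
      hot.push_front(b);
      break;
    }
  }
};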
(4) Inserting onode metadata into the cache
Unlike data buffers, onode metadata lives on a single LRU list (onode_lru), even in TwoQCache:
void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
{
  auto p = onode_lru.iterator_to(*o); // locate the onode in the list
  onode_lru.erase(p);
  onode_lru.push_front(*o);           // move it to the MRU end
}
void _add_onode(OnodeRef& o, int level) override {
  if (level > 0)
    onode_lru.push_front(*o);  // hot insert at the MRU end
  else
    onode_lru.push_back(*o);   // cold insert at the LRU end
}
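onode_lru is a boost::intrusive list, which is why the code can turn an object reference into its list position with iterator_to() in O(1). A minimal sketch of the same idiom (hypothetical node_t type, not BlueStore's):
#include <boost/intrusive/list.hpp>

namespace bi = boost::intrusive;

struct node_t : public bi::list_base_hook<> {
  int id;
};

using lru_list_t = bi::list<node_t>;

// Move an element to the MRU end, exactly as _touch_onode() does:
void touch(lru_list_t &lru, node_t &n) {
  auto p = lru.iterator_to(n);  // derive the iterator from the object itself
  lru.erase(p);                 // unlink (intrusive erase does not destroy)
  lru.push_front(n);
}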
(5) Cache trim
void *BlueStore::MempoolThread::entry()
while (!stop) {
uint64_t meta_bytes =
mempool::bluestore_cache_other::allocated_bytes() +
mempool::bluestore_cache_onode::allocated_bytes();
uint64_t onode_num = mempool::bluestore_cache_onode::allocated_items();
if (onode_num < 2) {
onode_num = 2;
}
float bytes_per_onode = (float)meta_bytes / (float)onode_num;
size_t num_shards = store->cache_shards.size();
float target_ratio = store->cache_meta_ratio + store->cache_data_ratio;
// A little sloppy but should be close enough
uint64_t shard_target = target_ratio * (store->cache_size / num_shards);
for (auto i : store->cache_shards) {
// per-shard target size, the two ratios, and the current average bytes per onode
i->trim(shard_target, store->cache_meta_ratio, store->cache_data_ratio, bytes_per_onode);
uint64_t current_meta = _get_num_onodes() * bytes_per_onode;
return onode_lru.size(); // onode_lru_list_t onode_lru; get_onode() and touch_onode() both insert into onode_lru
uint64_t current_buffer = _get_buffer_bytes();
return buffer_bytes; // updated in _add_buffer(); this counts the cached object data
uint64_t current = current_meta + current_buffer;
uint64_t target_meta = target_bytes * target_meta_ratio;
uint64_t target_buffer = target_bytes * target_data_ratio;
target_meta = min(target_bytes, target_meta);
target_buffer = min(target_bytes - target_meta, target_buffer);
if (current <= target_bytes)
return;
uint64_t need_to_free = current - target_bytes;
uint64_t free_buffer = 0;
uint64_t free_meta = 0;
if (current_buffer > target_buffer) {
free_buffer = current_buffer - target_buffer;
if (free_buffer > need_to_free) {
free_buffer = need_to_free;
free_meta = need_to_free - free_buffer;
//the code above computes how many bytes of metadata and data to free
//now compute the maximum amounts to keep, i.e. the sizes at which freeing starts
uint64_t max_buffer = current_buffer - free_buffer;
uint64_t max_meta = current_meta - free_meta;
uint64_t max_onodes = max_meta / bytes_per_onode;
_trim(max_onodes, max_buffer);
if (buffer_bytes > buffer_max)
uint64_t kin = buffer_max * cct->_conf->bluestore_2q_cache_kin_ratio; // default 0.5
uint64_t khot = buffer_max - kin;
uint64_t buffer_num = buffer_hot.size() + buffer_warm_in.size(); // number of cached items
uint64_t buffer_avg_size = buffer_bytes / buffer_num; // average size of one item
uint64_t calculated_buffer_num = buffer_max / buffer_avg_size; // expected number of items
kout = calculated_buffer_num * cct->_conf->bluestore_2q_cache_kout_ratio; // expected number of items in the warm_out list
if (buffer_list_bytes[BUFFER_HOT] < khot) // HOT is under its budget, so give the surplus to WARM_IN and evict less from it
kin += khot - buffer_list_bytes[BUFFER_HOT];
else if (buffer_list_bytes[BUFFER_WARM_IN] < kin) // WARM_IN is under its budget, so give the surplus to HOT and evict less from it
khot += kin - buffer_list_bytes[BUFFER_WARM_IN];
int64_t to_evict_bytes = buffer_list_bytes[BUFFER_WARM_IN] - kin; // bytes to evict from WARM_IN
while (to_evict_bytes > 0)
auto p = buffer_warm_in.rbegin();
Buffer *b = &*p;
buffer_bytes -= b->length;
buffer_list_bytes[BUFFER_WARM_IN] -= b->length;
to_evict_bytes -= b->length;
evicted += b->length;
b->state = Buffer::STATE_EMPTY;
b->data.clear();
buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
buffer_warm_out.push_front(*b); // buffers moved to warm_out hold no data and no longer count toward the cached bytes
b->cache_private = BUFFER_WARM_OUT;
to_evict_bytes = buffer_list_bytes[BUFFER_HOT] - khot;
while (to_evict_bytes > 0)
auto p = buffer_hot.rbegin();
if (p == buffer_hot.rend())
break;
Buffer *b = &*p;
to_evict_bytes -= b->length;
evicted += b->length;
b->space->_rm_buffer(this, b);
_rm_buffer(cache, buffer_map.find(b->offset));
if (p->second->is_writing())
writing.erase(writing.iterator_to(*p->second));
else
cache->_rm_buffer(p->second.get());
buffer_bytes -= b->length;
buffer_list_bytes[b->cache_private] -= b->length;
switch (b->cache_private)
case BUFFER_WARM_IN:
buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));break;
case BUFFER_WARM_OUT:
buffer_warm_out.erase(buffer_warm_out.iterator_to(*b));break;
case BUFFER_HOT:
buffer_hot.erase(buffer_hot.iterator_to(*b));break;
int64_t num = buffer_warm_out.size() - kout;
while (num-- > 0)
Buffer *b = &*buffer_warm_out.rbegin();
b->space->_rm_buffer(this, b);
//as seen above, buffers evicted from warm_in are moved to warm_out, while eviction from hot or warm_out deletes the buffer for real
int num = onode_lru.size() - onode_max;
int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
auto p = onode_lru.end();
while(num > 0)
Onode *o = &*p;
int refs = o->nref.load();
if (refs > 1)
if (++skipped >= max_skipped)
break;
if (p == onode_lru.begin())
break;
else
p--;
num--;
continue;
if (p != onode_lru.begin())
onode_lru.erase(p--);
else
onode_lru.erase(p);
o->get(); // paranoia
o->c->onode_map.remove(o->oid); // also remove it from the Collection's OnodeSpace onode_map
o->put();
--num;
store->_update_cache_logger();
utime_t wait;
wait += store->cct->_conf->bluestore_cache_trim_interval;
cond.WaitInterval(lock, wait);
}
stop = false;
return NULL;
(1) Compute the number of onodes in the cache via mempool::bluestore_cache_onode::allocated_items(), and the metadata size via mempool::bluestore_cache_other::allocated_bytes() plus mempool::bluestore_cache_onode::allocated_bytes().
(2) Compute the target size of each shard and the average metadata bytes per onode.
(3) For each shard, call BlueStore::Cache::trim() to compute how many onodes to keep and the maximum number of cached data bytes, and then let BlueStore::TwoQCache::_trim() perform the actual eviction.
(4) TwoQCache maintains three buffer lists: buffer_hot holds hot data, buffer_warm_in holds newly added data, and buffer_warm_out holds data evicted from buffer_warm_in. buffer_max is the combined size limit of buffer_warm_in and buffer_hot; bluestore_2q_cache_kin_ratio is buffer_warm_in's share of buffer_max, and bluestore_2q_cache_kout_ratio determines buffer_warm_out's size relative to buffer_max.
(5) Excess buffers are evicted from the tail of buffer_warm_in and inserted at the head of buffer_warm_out.
(6) Excess buffers are evicted from the tail of buffer_hot and removed from the cache outright; eviction from buffer_warm_out behaves the same way as for buffer_hot.
(7) Excess onodes are trimmed from onode_lru. Note that a trimmed onode must also be removed from the Collection's onode_map.
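As a worked example with assumed numbers (chosen for easy arithmetic, not Ceph defaults): suppose cache_size = 1 GiB, num_shards = 8, cache_meta_ratio = cache_data_ratio = 0.5, and bytes_per_onode = 4 KiB. Then trim() computes:
target_ratio  = 0.5 + 0.5           = 1.0
shard_target  = 1.0 * (1 GiB / 8)   = 128 MiB per shard
target_meta   = 128 MiB * 0.5       = 64 MiB
target_buffer = 128 MiB * 0.5       = 64 MiB
If a shard currently holds current_meta = 80 MiB and current_buffer = 80 MiB, then current = 160 MiB > 128 MiB, so need_to_free = 32 MiB; free_buffer = min(80 MiB - 64 MiB, 32 MiB) = 16 MiB and free_meta = 32 MiB - 16 MiB = 16 MiB, giving max_buffer = 64 MiB and max_onodes = (80 MiB - 16 MiB) / 4 KiB = 16384 onodes passed to _trim().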