Ceph's BlueStore cache holds object information, both metadata and actual data: the metadata side is managed with an LRU scheme, while the actual data is managed by the TwoQ (2Q) cache.
(1) Memory management
BlueStore defines a set of namespaces, each carrying its own declarations, through the following macro:
#define P(x) \
namespace x { \
static const mempool::pool_index_t id = mempool::mempool_##x; \
template<typename v> \
using pool_allocator = mempool::pool_allocator<id,v>; \
\
using string = std::basic_string<char,std::char_traits<char>, \
pool_allocator<char>>; \
\
template<typename k,typename v, typename cmp = std::less<k> > \
using map = std::map<k, v, cmp, \
pool_allocator<std::pair<const k,v>>>; \
\
template<typename k,typename v, typename cmp = std::less<k> > \
using compact_map = compact_map<k, v, cmp, \
pool_allocator<std::pair<const k,v>>>; \
\
template<typename k, typename cmp = std::less<k> > \
using compact_set = compact_set<k, cmp, pool_allocator<k>>; \
\
template<typename k,typename v, typename cmp = std::less<k> > \
using multimap = std::multimap<k,v,cmp, \
pool_allocator<std::pair<const k, \
v>>>; \
\
template<typename k, typename cmp = std::less<k> > \
using set = std::set<k,cmp,pool_allocator<k>>; \
\
template<typename k, typename cmp = std::less<k> > \
using flat_set = boost::container::flat_set<k,cmp,pool_allocator<k>>; \
\
template<typename k, typename v, typename cmp = std::less<k> > \
using flat_map = boost::container::flat_map<k,v,cmp, \
pool_allocator<std::pair<k,v>>>; \
\
template<typename v> \
using list = std::list<v,pool_allocator<v>>; \
\
template<typename v> \
using vector = std::vector<v,pool_allocator<v>>; \
\
template<typename k, typename v, \
typename h=std::hash<k>, \
typename eq = std::equal_to<k>> \
using unordered_map = \
std::unordered_map<k,v,h,eq,pool_allocator<std::pair<const k,v>>>;\
\
inline size_t allocated_bytes() { \
return mempool::get_pool(id).allocated_bytes(); \
} \
inline size_t allocated_items() { \
return mempool::get_pool(id).allocated_items(); \
} \
};
DEFINE_MEMORY_POOLS_HELPER(P)
allocated_items() and allocated_bytes() report how many items and bytes are currently allocated from the pool.
The DEFINE_MEMORY_POOLS_HELPER macro is defined as follows:
#define DEFINE_MEMORY_POOLS_HELPER(f) \
f(bloom_filter) \
f(bluestore_alloc) \
f(bluestore_cache_data) \
f(bluestore_cache_onode) \
f(bluestore_cache_other) \
f(bluestore_fsck) \
f(bluestore_txc) \
f(bluestore_writing_deferred) \
f(bluestore_writing) \
f(bluefs) \
f(buffer_anon) \
f(buffer_meta) \
f(osd) \
f(osd_mapbl) \
f(osd_pglog) \
f(osdmap) \
f(osdmap_mapping) \
f(pgmap) \
f(mds_co) \
f(unittest_1) \
f(unittest_2)
Together, these two macros define one namespace per memory pool, along with the types and accounting helpers bound to each namespace.
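As a concrete illustration, here is a minimal sketch (hypothetical usage, not taken from BlueStore itself; the names extent_index and report_pool_usage are invented) of how code can place a container in one of these pools and query its accounting:
#include "include/mempool.h"
#include <iostream>

// A std::map whose nodes are charged to the bluestore_cache_other pool.
mempool::bluestore_cache_other::map<uint64_t, uint64_t> extent_index;

void report_pool_usage() {
  // Each pool namespace exposes the accounting helpers generated by P(x).
  std::cout << "bytes: "
            << mempool::bluestore_cache_other::allocated_bytes()
            << " items: "
            << mempool::bluestore_cache_other::allocated_items()
            << std::endl;
}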
(2) Overloading the new and delete operators
#define MEMPOOL_DEFINE_OBJECT_FACTORY(obj,factoryname,pool) \
MEMPOOL_DEFINE_FACTORY(obj, factoryname, pool) \
void *obj::operator new(size_t size) { \
return mempool::pool::alloc_##factoryname.allocate(1); \
} \
void obj::operator delete(void *p) { \
return mempool::pool::alloc_##factoryname.deallocate((obj*)p, 1); \
}
BlueStore.cc uses this macro as follows:
// bluestore_cache_onode
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode,
bluestore_cache_onode);
// bluestore_cache_other
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Buffer, bluestore_buffer,
bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Extent, bluestore_extent,
bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Blob, bluestore_blob,
bluestore_cache_other);
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::SharedBlob, bluestore_shared_blob,
bluestore_cache_other);
// bluestore_txc
MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::TransContext, bluestore_transcontext,
bluestore_txc);
Expanding the first invocation gives:
namespace mempool{
namespace bluestore_cache_onode{
pool_allocator<BlueStore::Onode> alloc_bluestore_onode = {true};
}
}
void * BlueStore::Onode::operator new(size_t size) {
return mempool::bluestore_cache_onode::alloc_bluestore_onode.allocate(1);
}
void BlueStore::Onode::operator delete(void *p) {
return mempool::bluestore_cache_onode::alloc_bluestore_onode.deallocate((BlueStore::Onode*)p, 1);
}
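With these overloads in place, every plain new/delete of an Onode is transparently charged to the pool; no call site needs to know about mempools (an illustrative summary, not BlueStore code):
// Onode *o = new Onode(...);  -> alloc_bluestore_onode.allocate(1),
//                                accounted in mempool_bluestore_cache_onode
// delete o;                   -> alloc_bluestore_onode.deallocate(o, 1),
//                                accounting reversed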
For the bluestore_cache_onode namespace, the pool_allocator is:
static const mempool::pool_index_t id = mempool::mempool_bluestore_cache_onode;
template<typename v>
using pool_allocator = mempool::pool_allocator<id,v>;
pool_allocator(bool force_register=false) // force_register is true here
init(force_register);
pool = &get_pool(pool_ix); // pool_ix is mempool_bluestore_cache_onode
static mempool::pool_t table[num_pools];
return table[ix];
if (debug_mode || force_register)
type = pool->get_type(typeid(T), sizeof(T)); // i.e. pool->get_type(typeid(BlueStore::Onode), sizeof(BlueStore::Onode))
auto p = type_map.find(ti.name()); // not found on the first call
if (p != type_map.end())
return &p->second;
type_t &t = type_map[ti.name()];
t.type_name = ti.name();
t.item_size = size;
return &t;
There are num_pools pools in total; for the bluestore_cache_onode namespace, get_pool() returns the bluestore_cache_onode pool.
The overloaded operator new then comes down to:
return mempool::bluestore_cache_onode::alloc_bluestore_onode.allocate(1);
size_t total = sizeof(T) * n; // T is BlueStore::Onode here
shard_t *shard = pool->pick_a_shard();
size_t me = (size_t)pthread_self();
size_t i = (me >> 3) & ((1 << num_shard_bits) - 1);
return &shard[i]; // pool_t defines shard_t shard[num_shards], with num_shards = 1 << num_shard_bits and num_shard_bits = 5
shard->bytes += total;
shard->items += n;
if (type)
type->items += n;
T* r = reinterpret_cast<T*>(new char[total]); // the actual memory allocation
return r;
So allocate() just updates the accounting in one of the pool's shards and then performs the real allocation.
As a result, every new Onode records the allocated memory in a shard of the corresponding pool.
The other namespaces behave the same way.
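To make the accounting concrete, here is a minimal, self-contained sketch of the same idea (a simplified stand-in, not Ceph's pool_allocator: it uses a single global counter pair instead of the 32 shards that pick_a_shard() selects by thread id):
#include <atomic>
#include <cstddef>

// Hypothetical simplified pool: one counter pair instead of
// Ceph's shard_t shard[num_shards] indexed by pthread_self().
struct simple_pool_t {
  std::atomic<long> bytes{0};
  std::atomic<long> items{0};
};

template <typename T>
struct counting_allocator {
  simple_pool_t *pool;

  T *allocate(std::size_t n) {
    std::size_t total = sizeof(T) * n;
    pool->bytes += total;                          // like shard->bytes += total
    pool->items += n;                              // like shard->items += n
    return reinterpret_cast<T*>(new char[total]);  // the actual allocation
  }
  void deallocate(T *p, std::size_t n) {
    pool->bytes -= sizeof(T) * n;                  // reverse the accounting
    pool->items -= n;
    delete[] reinterpret_cast<char*>(p);
  }
};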
(3) Inserting object data into the cache
Data is inserted into the cache through the _buffer_cache_write() function:
_buffer_cache_write(txc, b, b_off, bl, wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
b->shared_blob->bc.write(b->shared_blob->get_cache(), txc->seq, offset, bl, flags); // the shared_blob's cache is the TwoQCache owned by BlueStore
Buffer *b = new Buffer(this, Buffer::STATE_WRITING, seq, offset, bl, flags);
b->cache_private = _discard(cache, offset, bl.length());
_add_buffer(cache, b, (flags & Buffer::FLAG_NOCACHE) ? 0 : 1, nullptr);
buffer_map[b->offset].reset(b);
if (b->is_writing())
b->data.reassign_to_mempool(mempool::mempool_bluestore_writing);
if (writing.empty() || writing.rbegin()->seq <= b->seq)
writing.push_back(*b);
else
... // otherwise insert into writing at the position ordered by seq
else
b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data); // deduct from the old pool, then charge the new one
cache->_add_buffer(b, level, near);
txc->shared_blobs_written.insert(b->shared_blob);
In _buffer_cache_write(), the Buffer is linked into the writing list, indicating that its data is being written to disk; the accounting of the mempool_bluestore_writing pool is updated at the same time, and the shared_blob is recorded in shared_blobs_written.
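Putting the pieces together, the life of a buffered write looks roughly like this (a summary of the flow described in this section, not code from BlueStore; finish_write() is covered further below):
// Buffer lifecycle on the write path (summary):
//
//   _buffer_cache_write()
//     -> new Buffer(STATE_WRITING), charged to mempool_bluestore_writing
//     -> linked into BufferSpace::writing; shared_blob recorded in
//        txc->shared_blobs_written
//   ... transaction commits ...
//   _txc_state_proc(STATE_FINISHING) -> finish_write()
//     -> FLAG_NOCACHE set: buffer erased from buffer_map
//     -> otherwise: STATE_CLEAN, re-charged to mempool_bluestore_cache_data,
//        inserted into the TwoQ lists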
Because already-cached data may overlap the new data, the overlapping ranges must be removed first; this is done by _discard(), shown below:
auto i = _data_lower_bound(offset); // find in buffer_map the first Buffer that may overlap offset
uint32_t end = offset + length;
while (i != buffer_map.end())
Buffer *b = i->second.get();
if (b->offset >= end)
break;
if (b->cache_private > cache_private) // if any overlapped segment has a higher cache level, promote to that level
cache_private = b->cache_private;
// part one
if (b->offset < offset) // handle the first overlapping Buffer, which starts before offset
int64_t front = offset - b->offset;
if (b->end() > end)
uint32_t tail = b->end() - end;
bl.substr_of(b->data, b->length - tail, tail);
Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
_add_buffer(cache, nb, 0, b); // insert the new tail Buffer into the TwoQCache
if (!b->is_writing())
cache->_adjust_buffer_size(b, front - (int64_t)b->length); // only updates the accounting
b->truncate(front); // drop b's old data, keeping only its first front bytes
break;
else
if (!b->is_writing())
cache->_adjust_buffer_size(b, front - (int64_t)b->length);
b->truncate(front);
++i;
continue;
// part two
if (b->end() <= end) // a Buffer in the middle that is completely covered
_rm_buffer(cache, i++);
if (p->second->is_writing())
writing.erase(writing.iterator_to(*p->second));
else
cache->_rm_buffer(p->second.get());
buffer_bytes -= b->length;
buffer_list_bytes[b->cache_private] -= b->length;
switch (b->cache_private) {
case BUFFER_WARM_IN:
buffer_warm_in.erase(buffer_warm_in.iterator_to(*b)); break;
case BUFFER_WARM_OUT:
buffer_warm_out.erase(buffer_warm_out.iterator_to(*b)); break;
case BUFFER_HOT:
buffer_hot.erase(buffer_hot.iterator_to(*b)); break;
buffer_map.erase(p);
continue;
uint32_t keep = b->end() - end;
// part three
if (b->data.length()) // handle the last overlapping Buffer, which extends past end
bl.substr_of(b->data, b->length - keep, keep);
Buffer *nb = new Buffer(this, b->state, b->seq, end, bl);
_add_buffer(cache, nb, 0, b);
_rm_buffer(cache, i);
(1) Part one handles the Buffer1 case: truncate() shortens the original Buffer1 so that only its first front bytes remain cached.
(2) Part two handles the Buffer2 case: a fully covered Buffer is deleted outright via _rm_buffer(), which also unlinks it from the corresponding TwoQ list.
(3) Part three handles the Buffer3 case: a new Buffer of keep bytes is created and inserted into the TwoQCache and buffer_map, and then Buffer3 itself is removed.
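The three cases can be pictured as follows (an illustrative layout for a new write covering [offset, end); the Buffer1/2/3 names follow the text above):
// new write:          offset                      end
//                        |<------- length ------->|
// Buffer1:    [---front---|#####]                       -> truncate(front)
// Buffer2:                   [#########]                -> _rm_buffer()
// Buffer3:                        [#####|---keep---]    -> new Buffer of keep
//                                                          bytes added at end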
During write-request processing, _txc_state_proc() is invoked at the end; in the final STATE_FINISHING state it processes the buffers on the writing list, as follows:
case TransContext::STATE_FINISHING:
_txc_finish(txc);
for (auto& sb : txc->shared_blobs_written)
sb->bc.finish_write(sb->get_cache(), txc->seq);
auto i = writing.begin();
while (i != writing.end())
if (b->flags & Buffer::FLAG_NOCACHE)
writing.erase(i++);
buffer_map.erase(b->offset);
else
b->state = Buffer::STATE_CLEAN;
writing.erase(i++);
b->maybe_rebuild();
b->data.reassign_to_mempool(mempool::mempool_bluestore_cache_data);
cache->_add_buffer(b, 1, nullptr); //TwoQCache::_add_buffer
if (b->cache_private == BUFFER_NEW) // a buffer starts as BUFFER_NEW and is admitted into buffer_warm_in
b->cache_private = BUFFER_WARM_IN;
if (level > 0)
buffer_warm_in.push_front(*b);
else
buffer_warm_in.push_back(*b);
else
switch (b->cache_private)
case BUFFER_WARM_IN:
buffer_warm_in.push_front(*b); break; // already WARM_IN: only move to the front of the same list
case BUFFER_WARM_OUT: // WARM_OUT means evicted once; promote to HOT (BUFFER_WARM_OUT is only assigned in _trim())
b->cache_private = BUFFER_HOT;
// fall through
case BUFFER_HOT:
buffer_hot.push_front(*b);
For each shared_blob recorded by _buffer_cache_write(), _txc_finish() calls finish_write() on the shared_blob's buffer cache (bc); finish_write() is responsible for moving the buffer into the TwoQ cache lists and updating the accounting of the mempool_bluestore_cache_data pool. New data first lands in the buffer_warm_in list; if it is read or written again later, it is promoted into buffer_hot.
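For reference, the admission and promotion policy implemented by finish_write() and TwoQCache::_add_buffer() can be sketched on its own (a minimal stand-alone illustration using std::list; BlueStore itself uses boost::intrusive lists, and the names here only mirror the real ones):
#include <cstdint>
#include <list>

enum cache_list_t { BUFFER_NEW, BUFFER_WARM_IN, BUFFER_WARM_OUT, BUFFER_HOT };

struct buf_t {
  cache_list_t where = BUFFER_NEW;
};

struct two_q_sketch {
  std::list<buf_t*> warm_in, warm_out, hot;

  // Mirrors the branches of TwoQCache::_add_buffer():
  void add(buf_t *b) {
    switch (b->where) {
    case BUFFER_NEW:        // first insertion: admit into warm_in
      b->where = BUFFER_WARM_IN;
      warm_in.push_front(b);
      break;
    case BUFFER_WARM_IN:    // re-accessed while in warm_in: stay, move to front
      warm_in.remove(b);
      warm_in.push_front(b);
      break;
    case BUFFER_WARM_OUT:   // re-accessed after eviction: promote to hot
      warm_out.remove(b);
      b->where = BUFFER_HOT;
      hot.push_front(b);
      break;
    case BUFFER_HOT:        // already hot: move to front of hot
      hot.remove(b);
      hot.push_front(b);
      break;
    }
  }
};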
(4) Inserting onode metadata into the cache
Unlike data buffers, onode metadata lives on a single LRU list (onode_lru), even in TwoQCache:
void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
{
  auto p = onode_lru.iterator_to(*o); // locate the onode in the list
  onode_lru.erase(p);
  onode_lru.push_front(*o);           // move it to the MRU end
}
void _add_onode(OnodeRef& o, int level) override {
  if (level > 0)
    onode_lru.push_front(*o);  // hot insert at the MRU end
  else
    onode_lru.push_back(*o);   // cold insert at the LRU end
}
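onode_lru is a boost::intrusive list, which is why the code can turn an object reference into its list position with iterator_to() in O(1). A minimal sketch of the same idiom (hypothetical node_t type, not BlueStore's):
#include <boost/intrusive/list.hpp>

namespace bi = boost::intrusive;

struct node_t : public bi::list_base_hook<> {
  int id;
};

using lru_list_t = bi::list<node_t>;

// Move an element to the MRU end, exactly as _touch_onode() does:
void touch(lru_list_t &lru, node_t &n) {
  auto p = lru.iterator_to(n);  // derive the iterator from the object itself
  lru.erase(p);                 // unlink (intrusive erase does not destroy)
  lru.push_front(n);
}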
(5) Cache trim
void *BlueStore::MempoolThread::entry()
while (!stop) {
uint64_t meta_bytes =
mempool::bluestore_cache_other::allocated_bytes() +
mempool::bluestore_cache_onode::allocated_bytes();
uint64_t onode_num = mempool::bluestore_cache_onode::allocated_items();
if (onode_num < 2) {
onode_num = 2;
}
float bytes_per_onode = (float)meta_bytes / (float)onode_num;
size_t num_shards = store->cache_shards.size();
float target_ratio = store->cache_meta_ratio + store->cache_data_ratio;
// A little sloppy but should be close enough
uint64_t shard_target = target_ratio * (store->cache_size / num_shards);
for (auto i : store->cache_shards) {
// per-shard target size, the two ratios, and the current average bytes per onode
i->trim(shard_target, store->cache_meta_ratio, store->cache_data_ratio, bytes_per_onode);
uint64_t current_meta = _get_num_onodes() * bytes_per_onode;
return onode_lru.size(); // onode_lru_list_t onode_lru; get_onode() and touch_onode() both insert into onode_lru
uint64_t current_buffer = _get_buffer_bytes();
return buffer_bytes; // updated in _add_buffer(); this counts the cached object data
uint64_t current = current_meta + current_buffer;
uint64_t target_meta = target_bytes * target_meta_ratio;
uint64_t target_buffer = target_bytes * target_data_ratio;
target_meta = min(target_bytes, target_meta);
target_buffer = min(target_bytes - target_meta, target_buffer);
if (current <= target_bytes)
return;
uint64_t need_to_free = current - target_bytes;
uint64_t free_buffer = 0;
uint64_t free_meta = 0;
if (current_buffer > target_buffer) {
free_buffer = current_buffer - target_buffer;
if (free_buffer > need_to_free) {
free_buffer = need_to_free;
free_meta = need_to_free - free_buffer;
//the code above computes how many bytes of metadata and data to free
//now compute the maximum amounts to keep, i.e. the sizes at which freeing starts
uint64_t max_buffer = current_buffer - free_buffer;
uint64_t max_meta = current_meta - free_meta;
uint64_t max_onodes = max_meta / bytes_per_onode;
_trim(max_onodes, max_buffer);
if (buffer_bytes > buffer_max)
uint64_t kin = buffer_max * cct->_conf->bluestore_2q_cache_kin_ratio; // default 0.5
uint64_t khot = buffer_max - kin;
uint64_t buffer_num = buffer_hot.size() + buffer_warm_in.size(); // number of cached items
uint64_t buffer_avg_size = buffer_bytes / buffer_num; // average size of one item
uint64_t calculated_buffer_num = buffer_max / buffer_avg_size; // expected number of items
kout = calculated_buffer_num * cct->_conf->bluestore_2q_cache_kout_ratio; // expected number of items in the warm_out list
if (buffer_list_bytes[BUFFER_HOT] < khot) // HOT is under its budget, so give the surplus to WARM_IN and evict less from it
kin += khot - buffer_list_bytes[BUFFER_HOT];
else if (buffer_list_bytes[BUFFER_WARM_IN] < kin) // WARM_IN is under its budget, so give the surplus to HOT and evict less from it
khot += kin - buffer_list_bytes[BUFFER_WARM_IN];
int64_t to_evict_bytes = buffer_list_bytes[BUFFER_WARM_IN] - kin; // bytes to evict from WARM_IN
while (to_evict_bytes > 0)
auto p = buffer_warm_in.rbegin();
Buffer *b = &*p;
buffer_bytes -= b->length;
buffer_list_bytes[BUFFER_WARM_IN] -= b->length;
to_evict_bytes -= b->length;
evicted += b->length;
b->state = Buffer::STATE_EMPTY;
b->data.clear();
buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));
buffer_warm_out.push_front(*b); // buffers moved to warm_out hold no data and no longer count toward the cached bytes
b->cache_private = BUFFER_WARM_OUT;
to_evict_bytes = buffer_list_bytes[BUFFER_HOT] - khot;
while (to_evict_bytes > 0)
auto p = buffer_hot.rbegin();
if (p == buffer_hot.rend())
break;
Buffer *b = &*p;
to_evict_bytes -= b->length;
evicted += b->length;
b->space->_rm_buffer(this, b);
_rm_buffer(cache, buffer_map.find(b->offset));
if (p->second->is_writing())
writing.erase(writing.iterator_to(*p->second));
else
cache->_rm_buffer(p->second.get());
buffer_bytes -= b->length;
buffer_list_bytes[b->cache_private] -= b->length;
switch (b->cache_private)
case BUFFER_WARM_IN:
buffer_warm_in.erase(buffer_warm_in.iterator_to(*b));break;
case BUFFER_WARM_OUT:
buffer_warm_out.erase(buffer_warm_out.iterator_to(*b));break;
case BUFFER_HOT:
buffer_hot.erase(buffer_hot.iterator_to(*b));break;
int64_t num = buffer_warm_out.size() - kout;
while (num-- > 0)
Buffer *b = &*buffer_warm_out.rbegin();
b->space->_rm_buffer(this, b);
//as seen above, buffers evicted from warm_in are moved to warm_out, while eviction from hot or warm_out deletes the buffer for real
int num = onode_lru.size() - onode_max;
int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
auto p = onode_lru.end();
while(num > 0)
Onode *o = &*p;
int refs = o->nref.load();
if (refs > 1)
if (++skipped >= max_skipped)
break;
if (p == onode_lru.begin())
break;
else
p--;
num--;
continue;
if (p != onode_lru.begin())
onode_lru.erase(p--);
else
onode_lru.erase(p);
o->get(); // paranoia
o->c->onode_map.remove(o->oid); // also remove it from the Collection's OnodeSpace onode_map
o->put();
--num;
store->_update_cache_logger();
utime_t wait;
wait += store->cct->_conf->bluestore_cache_trim_interval;
cond.WaitInterval(lock, wait);
}
stop = false;
return NULL;
(1) Compute the number of onodes in the cache via mempool::bluestore_cache_onode::allocated_items(), and the metadata size via mempool::bluestore_cache_other::allocated_bytes() plus mempool::bluestore_cache_onode::allocated_bytes().
(2) Compute the target size of each shard and the average metadata bytes per onode.
(3) For each shard, call BlueStore::Cache::trim() to compute how many onodes to keep and the maximum number of cached data bytes, and then let BlueStore::TwoQCache::_trim() perform the actual eviction.
(4) TwoQCache maintains three buffer lists: buffer_hot holds hot data, buffer_warm_in holds newly added data, and buffer_warm_out holds data evicted from buffer_warm_in. buffer_max is the combined size limit of buffer_warm_in and buffer_hot; bluestore_2q_cache_kin_ratio is buffer_warm_in's share of buffer_max, and bluestore_2q_cache_kout_ratio determines buffer_warm_out's size relative to buffer_max.
(5) Excess buffers are evicted from the tail of buffer_warm_in and inserted at the head of buffer_warm_out.
(6) Excess buffers are evicted from the tail of buffer_hot and removed from the cache outright; eviction from buffer_warm_out behaves the same way as for buffer_hot.
(7) Excess onodes are trimmed from onode_lru. Note that a trimmed onode must also be removed from the Collection's onode_map.
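As a worked example with assumed numbers (chosen for easy arithmetic, not Ceph defaults): suppose cache_size = 1 GiB, num_shards = 8, cache_meta_ratio = cache_data_ratio = 0.5, and bytes_per_onode = 4 KiB. Then trim() computes:
target_ratio  = 0.5 + 0.5           = 1.0
shard_target  = 1.0 * (1 GiB / 8)   = 128 MiB per shard
target_meta   = 128 MiB * 0.5       = 64 MiB
target_buffer = 128 MiB * 0.5       = 64 MiB
If a shard currently holds current_meta = 80 MiB and current_buffer = 80 MiB, then current = 160 MiB > 128 MiB, so need_to_free = 32 MiB; free_buffer = min(80 MiB - 64 MiB, 32 MiB) = 16 MiB and free_meta = 32 MiB - 16 MiB = 16 MiB, giving max_buffer = 64 MiB and max_onodes = (80 MiB - 16 MiB) / 4 KiB = 16384 onodes passed to _trim().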