librbd cache management: readx / writex in detail

Overview

readx / writex implement the core read/write logic of the cache: when librbd has caching enabled, every read and write operation is completed by these two functions. This article walks through their logic.

readx / writex

Invocation of readx / writex

First, let's look at how these two functions are invoked, taking writex as the example: it is called by ObjectCacherObjectDispatch<I>::write(), which is mainly responsible for assembling the necessary parameters and creating the callbacks.

When ObjectDispatcher<I>::send() reaches the OBJECT_DISPATCH_LAYER_CACHE layer in m_object_dispatches, and object_dispatch_spec->request holds an ObjectDispatchSpec::WriteRequest, boost::apply_visitor resolves to the WriteRequest overload of operator() in SendVisitor, which in turn calls ObjectCacherObjectDispatch<I>::write(). The WriteRequest itself is created by the create_write() factory (returning an ObjectDispatchSpec*) when the request is constructed (ImageWriteRequest<librbd::ImageCtx>::create_object_request).
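As an aside, the dispatch mechanism is easier to see in a minimal, self-contained sketch of boost::apply_visitor; the payload types below are hypothetical stand-ins, not the actual librbd ones:

#include <boost/variant.hpp>
#include <iostream>

// hypothetical stand-ins for the request payload types held by ObjectDispatchSpec
struct ReadRequest  {};
struct WriteRequest {};

using Request = boost::variant<ReadRequest, WriteRequest>;

// plays the role of SendVisitor: one operator() overload per request type
struct SendVisitor : public boost::static_visitor<void> {
  void operator()(ReadRequest&) const  { std::cout << "dispatch read\n"; }
  void operator()(WriteRequest&) const { std::cout << "dispatch write\n"; }
};

int main() {
  Request req = WriteRequest{};
  SendVisitor visitor;
  // apply_visitor picks the overload matching the type currently stored in the
  // variant -- this is how send() ends up in ObjectCacherObjectDispatch<I>::write()
  boost::apply_visitor(visitor, req);
}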

template <typename I>
bool ObjectCacherObjectDispatch<I>::write(/**/) {
  auto cct = m_image_ctx->cct;

  // ensure we aren't holding the cache lock post-write
  on_dispatched = util::create_async_context_callback(*m_image_ctx, on_dispatched);
    // queue on_dispatched on image_ctx.op_work_queue for later execution; this on_dispatched corresponds to the request (req)
                // vvvvvvvvvvvvvvvvvvvv
                    template <typename I>
                    Context *create_async_context_callback(I &image_ctx, Context *on_finish) {
                      return new detail::C_AsyncCallback<
                        typename std::decay<decltype(*image_ctx.op_work_queue)>::type>(
                          image_ctx.op_work_queue, on_finish);
                    }
               // ^^^^^^^^^^^^^^^^

//...
  ObjectCacher::OSDWrite *wr = m_object_cacher->prepare_write(snapc, data, ceph::real_time::min(), op_flags, *journal_tid);
    		// vvvvvvvvvvvvvvvvv prepare_write()
                  OSDWrite *prepare_write(const SnapContext& sc,
                          const bufferlist &b,
                          ceph::real_time mt,
                          int f,
                          ceph_tid_t journal_tid) const {
                	return new OSDWrite(sc, b, mt, f, journal_tid);
              }
    		// ^^^^^^^^^^^^^^^^^^
  //...
  ObjectExtent extent(oid, 0, object_off, data.length(), 0);
  extent.oloc.pool = m_image_ctx->data_ctx.get_id();
  extent.buffer_extents.push_back({0, data.length()});
  wr->extents.push_back(extent); // push the extent to write into wr->extents; note there is only one extent here

  *dispatch_result = io::DISPATCH_RESULT_COMPLETE;

  m_cache_lock.Lock(); // the dispatch layer's cache lock protecting the ObjectCacher
  m_object_cacher->writex(wr, m_object_set, on_dispatched, &trace);
  m_cache_lock.Unlock();
  return true;
}

writex

Now let's look at ObjectCacher::writex.

writex first takes the oid of the target object (computed by Striper::file_to_extents()) and the pool id, and uses get_object() to fetch -- or create -- the corresponding cached Object *o. It then calls o->map_write() to locate (creating it if absent) the cache segment covering the write, obtaining a BufferHead *bh. That buffer is marked BufferHead::STATE_DIRTY and the data is written into bh; finally _wait_for_write() carries the write forward (either writing through to the backend or queueing the request for asynchronous handling).

int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace,
			 ZTracer::Trace *parent_trace)
{
  ceph_assert(lock.is_locked());
  ceph::real_time now = ceph::real_clock::now();
  uint64_t bytes_written = 0;
  uint64_t bytes_written_in_flush = 0;
  bool dontneed = wr->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
  bool nocache = wr->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_NOCACHE;

  ZTracer::Trace trace;
  if (parent_trace != nullptr) {
    trace.init("write", &trace_endpoint, parent_trace);
    trace.event("start");
  }

  for (vector<ObjectExtent>::iterator ex_it = wr->extents.begin();
       ex_it != wr->extents.end();
       ++ex_it) {
    // get object cache
    sobject_t soid(ex_it->oid, CEPH_NOSNAP);
    // get an existing Object or create a new one; see https://blog.csdn.net/MrSate/article/details/118464146 for its structure in detail
    Object *o = get_object(soid, ex_it->objectno, oset, ex_it->oloc,
			   ex_it->truncate_size, oset->truncate_seq);

    // obtain a BufferHead covering exactly the cache segment this write targets, and register it with the Object / ObjectCacher
    BufferHead *bh = o->map_write(*ex_it, wr->journal_tid);
    bool missing = bh->is_missing(); // missing means a brand-new BufferHead that touched no existing cached data; e.g. final in ex.2 of the figure below is in the missing state
    bh->snapc = wr->snapc;

    bytes_written += ex_it->length;
    if (bh->is_tx()) {
      bytes_written_in_flush += ex_it->length;
    }

    // adjust buffer pointers (ie "copy" data into my cache)
    // this is over a single ObjectExtent, so we know that
    //  - there is one contiguous bh
    //  - the buffer frags need not be (and almost certainly aren't)
    // note: i assume striping is monotonic... no jumps backwards, ever!
    loff_t opos = ex_it->offset;
    for (vector<pair<uint64_t, uint64_t> >::iterator f_it
	   = ex_it->buffer_extents.begin();
	 f_it != ex_it->buffer_extents.end();
	 ++f_it) {
      ldout(cct, 10) << "writex writing " << f_it->first << "~"
		     << f_it->second << " into " << *bh << " at " << opos
		     << dendl;
      uint64_t bhoff = opos - bh->start();
      ceph_assert(f_it->second <= bh->length() - bhoff);

      // get the frag we're mapping in
      bufferlist frag;
      frag.substr_of(wr->bl, f_it->first, f_it->second);

      if (!bhoff) // move the data into bh
        bh->bl.swap(frag);
      else
        bh->bl.claim_append(frag);

      opos += f_it->second;
    }

    // ok, now bh is dirty.
    mark_dirty(bh);
    if (dontneed)
      bh->set_dontneed(true);
    else if (nocache && missing) // nocache was requested and the buffer was missing -- this data need not be kept cached in the Object?
      bh->set_nocache(true);
    else
      touch_bh(bh); // bump the buffer in the LRU
    bh->last_write = now;
    o->try_merge_bh(bh);  // merge adjacent buffers
  }

  if (perfcounter) {
    perfcounter->inc(l_objectcacher_data_written, bytes_written);
    if (bytes_written_in_flush) {
      perfcounter->inc(l_objectcacher_overwritten_in_flush,
		       bytes_written_in_flush);
    }
  }

  int r = _wait_for_write(wr, bytes_written, oset, &trace, onfreespace); // continue dispatching the write
  delete wr;

  //verify_stats();
  trim();
  return r;
}
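The frag handling above is worth a second look: swap() and claim_append() transfer buffer ownership instead of copying bytes. Below is a minimal sketch of the same pattern, assuming ceph's bufferlist API (it must be built against the Ceph source tree):

#include "include/buffer.h"  // ceph::bufferlist
#include <iostream>

int main() {
  ceph::bufferlist src;    // plays the role of wr->bl
  src.append("hello world", 11);

  ceph::bufferlist bh_bl;  // plays the role of bh->bl

  // the fragment landing at bhoff == 0 is taken over wholesale via swap()
  ceph::bufferlist frag1;
  frag1.substr_of(src, 0, 5);   // "hello"
  bh_bl.swap(frag1);

  // later fragments are appended without copying via claim_append()
  ceph::bufferlist frag2;
  frag2.substr_of(src, 5, 6);   // " world"
  bh_bl.claim_append(frag2);

  std::cout << bh_bl.length() << std::endl;  // 11
}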

map_write

The main job of ObjectCacher::Object::map_write() is to check map<loff_t, BufferHead*> Object::data for overlap between the range being written and the data already cached: overlapping parts are split out of the existing buffers, and parts with no existing cache get new buffer space created for them. The end result is one complete, contiguous cache segment.

The figure below illustrates the possible cases. extent is the range to be written, the green parts are data already cached, and the blue parts are the return value of map_write(), i.e. the mapped cache segment that will finally be written.

[Figure: the mapping cases of map_write, ex.1-ex.5]
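The original image did not survive extraction; below is a rough ASCII reconstruction of the cases, inferred from the code comments that follow (the exact shape of ex.1 is an assumption):

ex.1  cache:  (no cached data in range)
      write:      [=== extent ===]          -> final = one brand-new bh (missing)
ex.2  cache:                        [bh]
      write:      [=== extent ===]          -> final = new bh in the gap, ending before the cached bh (missing)
ex.3  cache:              [bh.....]
      write:      [=== extent ===]          -> new bh for the gap, then merge_left() with the split-off head of bh
ex.4  cache:  [bh.....]
      write:      [=== extent ===]          -> split bh at cur and take the right half, then extend it over the gap
ex.5  cache:  [bh.....................]
      write:      [=== extent ===]          -> split twice and take the middle piece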


ObjectCacher::BufferHead *ObjectCacher::Object::map_write(ObjectExtent &ex,
                       ceph_tid_t tid)
{
  ceph_assert(oc->lock.is_locked());
  BufferHead *final = 0;  // final is the cache segment that will ultimately be written

  ldout(oc->cct, 10) << "map_write oex " << ex.oid
              << " " << ex.offset << "~" << ex.length << dendl;

  loff_t cur = ex.offset;
  loff_t left = ex.length; // left is the remaining length that has not been mapped yet
  /* data_lower_bound returns the cached segment containing ex.offset; if none
  contains it, the first cached segment after ex.offset (start address greater
  than ex.offset); failing that, data.end(). Similar to std::map::lower_bound.
  For example, suppose the Object already holds three cached segments:
  [[0, 10], [100, 200], [4000, 4010]]
  If ex.offset is 0, a pointer to the [0, 10] segment is returned.
  If ex.offset is 1, a pointer to the [0, 10] segment is returned.
  If ex.offset is 11, a pointer to the [100, 200] segment is returned.
  If ex.offset is 4011, data.end() is returned.
  */
  map<loff_t, BufferHead*>::const_iterator p = data_lower_bound(ex.offset);
    /* vvvvvvvvvvvvvvvvvvvvvvvvvvv   data_lower_bound
             map<loff_t,BufferHead*>::const_iterator data_lower_bound(loff_t offset) const {
                  map<loff_t,BufferHead*>::const_iterator p = data.lower_bound(offset);
                  if (p != data.begin() && (p == data.end() || p->first > offset)) {
                        --p;     // might overlap!
                  		if (p->first + p->second->length() <= offset)
                  			++p;   // doesn't overlap.
                  }
                  return p;
              }
     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    */
    
  // This loop maps the whole range onto the data already in memory. Note the "loop invariant" of p above: p is the cached segment intersecting the range currently being mapped, or (when nothing intersects) the nearest cached segment after it.
  while (left > 0) {
    loff_t max = left;

    if (p == data.end()) { // no existing cache at or after p; just create one segment and return it
      if (final == NULL) {
        final = new BufferHead(this);
        replace_journal_tid(final, tid);
        final->set_start( cur );
        final->set_length( max );
        oc->bh_add(this, final);
        ldout(oc->cct, 10) << "map_write adding trailing bh " << *final << dendl;
      } else { 
        oc->bh_stat_sub(final);
        final->set_length(final->length() + max);
        oc->bh_stat_add(final);
      }
      left -= max;  
      cur += max;
      continue; // no further iterations: left == max
    }

    ldout(oc->cct, 10) << "cur is " << cur << ", p is " << *p->second << dendl;
    //oc->verify_stats();

    if (p->first <= cur) { // by p's definition (intersecting cur, or the next existing buffer), if the existing buffer starts at or before cur, then cur must intersect p -- cases ex.4 and ex.5 in the figure above
      BufferHead *bh = p->second;
      ldout(oc->cct, 10) << "map_write bh " << *bh << " intersected" << dendl;

      if (p->first < cur) { 
        ceph_assert(final == 0);
        if (cur + max >= bh->end()) { // the extent extends past bh; the first iteration of ex.4 in the figure above lands here
          // we want right bit (one splice)
          final = split(bh, cur);   // just split it, take right half.
          maybe_rebuild_buffer(bh);
          replace_journal_tid(final, tid);
          ++p;
          ceph_assert(p->second == final);
        } else {  // the extent is contained within bh; ex.5 in the figure above
          // we want middle bit (two splices)
          final = split(bh, cur);
          maybe_rebuild_buffer(bh);
          ++p;
          ceph_assert(p->second == final);
          auto right = split(final, cur+max);
          maybe_rebuild_buffer(right);
          replace_journal_tid(final, tid);
        }
      } else {
        ceph_assert(p->first == cur); // the extent starts exactly where bh starts
        if (bh->length() <= max) {
          // whole bufferhead, piece of cake.
        } else {
          // we want left bit (one splice)
          auto right = split(bh, cur + max);        // just split
          maybe_rebuild_buffer(right);
        }
        if (final) { // p coincides with cur and final already exists; e.g. the second iteration of ex.3 lands here
          oc->mark_dirty(bh); // the dirty state here doesn't matter -- the caller will mark the whole final dirty again anyway; this is only to satisfy merge_left()
          oc->mark_dirty(final);
          --p;  // move iterator back to final
          ceph_assert(p->second == final);
          replace_journal_tid(bh, tid);
          merge_left(final, bh); // merge final and bh; the final of ex.3 is produced by this merge
        } else {
          final = bh;
          replace_journal_tid(final, tid);
        }
      }

      // keep going.
      loff_t lenfromcur = final->end() - cur;
      cur += lenfromcur;
      left -= lenfromcur;
      ++p;
      continue;
    } else { // cur does not overlap any existing cache; ex.1, ex.2, ex.3 in the figure above
      // gap!
      loff_t next = p->first;
      loff_t glen = std::min(next - cur, max); // don't run into the next cached segment
      ldout(oc->cct, 10) << "map_write gap " << cur << "~" << glen << dendl;
      if (final) {
        oc->bh_stat_sub(final);
        final->set_length(final->length() + glen); // simply extend final; the second iteration of ex.4 lands here
        oc->bh_stat_add(final);
      } else {
        final = new BufferHead(this);  // cur sits in a gap, so a new BufferHead must be created; the first iteration of ex.3 lands here
        replace_journal_tid(final, tid);
        final->set_start( cur );
        final->set_length( glen );
        oc->bh_add(this, final);
      }

      cur += glen;
      left -= glen;
      continue;    // more?
    }
  }

  // set version
  ceph_assert(final);
  ceph_assert(final->get_journal_tid() == tid);
  ldout(oc->cct, 10) << "map_write final is " << *final << dendl;

  return final;  // the cache segment that will finally be written
}
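To make the data_lower_bound() semantics above concrete, here is a small standalone sketch over std::map, using a simplified hypothetical layout (segment start -> segment length instead of offset -> BufferHead*):

#include <cstdio>
#include <map>

using Off = long long;  // stand-in for ceph's loff_t

// same logic as ObjectCacher::Object::data_lower_bound(), but over a
// simplified (hypothetical) map of segment start -> segment length
std::map<Off, Off>::const_iterator
data_lower_bound(const std::map<Off, Off>& data, Off offset) {
  auto p = data.lower_bound(offset);
  if (p != data.begin() && (p == data.end() || p->first > offset)) {
    --p;     // the previous segment might overlap offset...
    if (p->first + p->second <= offset)
      ++p;   // ...it doesn't; move forward again
  }
  return p;
}

int main() {
  // the cached segments [0, 10], [100, 200], [4000, 4010] from the example above
  std::map<Off, Off> data{{0, 10}, {100, 100}, {4000, 10}};
  for (Off off : {0, 1, 11, 4011}) {
    auto p = data_lower_bound(data, off);
    if (p == data.end())
      std::printf("offset %lld -> data.end()\n", off);
    else
      std::printf("offset %lld -> segment [%lld, %lld]\n", off, p->first,
                  p->first + p->second);
  }
}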

_wait_for_write

ObjectCacher::_wait_for_write() is responsible for either pushing the write on to the backend or queueing a C_WaitForWrite for later asynchronous handling.

// blocking wait for write.
int ObjectCacher::_wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset,
				  ZTracer::Trace *trace, Context *onfreespace)
{
  ceph_assert(lock.is_locked());
  ceph_assert(trace != nullptr);
  int ret = 0;

  // By default, max_dirty only becomes greater than 0 after ObjectCacherObjectDispatch<I>::flush() has succeeded once (see rbd_cache_writethrough_until_flush); this guards against data loss when the user of the image never issues a flush
  if (max_dirty > 0 && !(wr->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_FUA)) {
    // writeback branch: return as soon as the data is in the cache
    if (block_writes_upfront) { // block_writes_upfront maps to the rbd_cache_block_writes_upfront option (default false): whether to block the write while waiting for cache space
      maybe_wait_for_writeback(len, trace); 
        // vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv  maybe_wait_for_writeback()
                   void ObjectCacher::maybe_wait_for_writeback(uint64_t len, ZTracer::Trace *trace) {
                      while (/* the cache is full */) {
                            flusher_cond.Signal();  // wake the flusher thread to write back dirty data
                            //...
                            stat_cond.Wait(lock);  // wait for the flusher to signal that space has been freed
                      }
              //..
       // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
      if (onfreespace)
        onfreespace->complete(0);
    } else { // don't block even when the cache is full; hand the request to the finisher thread to handle asynchronously
      ceph_assert(onfreespace); // TODO: onfreespace ??
      finisher.queue(new C_WaitForWrite(this, len, *trace, onfreespace));
    }
  } else {
    // writethrough: the data goes all the way to the backend
    Cond cond;
    bool done = false;
    Context *fin = block_writes_upfront ? new C_Cond(&cond, &done, &ret) : onfreespace;
    ceph_assert(fin);
    bool flushed = flush_set(oset, wr->extents, trace, fin);  // write to the backend
    ceph_assert(!flushed);   // we just dirtied it, and didn't drop our lock!
    if (block_writes_upfront) { // rbd_cache_block_writes_upfront
      while (!done)
        cond.Wait(lock); // block until the flush to the backend completes
      ldout(cct, 10) << "wait_for_write woke up, ret " << ret << dendl;
      if (onfreespace)
        onfreespace->complete(ret);
    }
  }

  // start writeback anyway?
  if (get_stat_dirty() > 0 && (uint64_t) get_stat_dirty() > target_dirty) {
    flusher_cond.Signal(); // signal the flusher to write back dirty data
  }
  return ret;
}
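The knobs involved here are ordinary RBD cache options; a ceph.conf fragment for reference (values shown are the usual defaults, for illustration only):

[client]
rbd cache = true
# stay in writethrough mode until the first flush arrives (keeps max_dirty at 0)
rbd cache writethrough until flush = true
# capacity and dirty thresholds driving maybe_wait_for_writeback() and the flusher
rbd cache size = 33554432          # 32 MiB
rbd cache max dirty = 25165824     # 24 MiB; 0 forces writethrough
rbd cache target dirty = 16777216  # 16 MiB; compared against get_stat_dirty()
# block in _wait_for_write() instead of queueing a C_WaitForWrite
rbd cache block writes upfront = false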

_readx

The flow of ObjectCacher::_readx() is similar to ObjectCacher::writex(): _readx() also first fetches the cached Object by oid and pool id and then maps the extent, except that map_read() classifies the covered buffers into the hits, missing, rx and errors buckets. For buffers that are absent (missing) or in flight (rx), it registers a C_RetryRead callback and sets success to false, so _readx() returns before any data is delivered; once the data has been read in, the previously registered callback runs _readx() again, and by then all the needed buffers are hits.

int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
          bool external_call, ZTracer::Trace *trace)
{
  ceph_assert(trace != nullptr);
  ceph_assert(lock.is_locked());
  bool success = true;
  int error = 0;
  uint64_t bytes_in_cache = 0;
  uint64_t bytes_not_in_cache = 0;
  uint64_t total_bytes_read = 0;
  map<uint64_t, bufferlist> stripe_map;  // final buffer offset -> substring
  bool dontneed = rd->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
  bool nocache = rd->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_NOCACHE;

  /*
   * WARNING: we can only meaningfully return ENOENT if the read request
   * passed in a single ObjectExtent.  Any caller who wants ENOENT instead of
   * zeroed buffers needs to feed single extents into readx().
   */
  ceph_assert(!oset->return_enoent || rd->extents.size() == 1);

  for (vector<ObjectExtent>::iterator ex_it = rd->extents.begin();
       ex_it != rd->extents.end();
       ++ex_it) {
    ldout(cct, 10) << "readx " << *ex_it << dendl;

    total_bytes_read += ex_it->length;

    // get Object cache
    sobject_t soid(ex_it->oid, rd->snap);
    Object *o = get_object(soid, ex_it->objectno, oset, ex_it->oloc,
            ex_it->truncate_size, oset->truncate_seq);
    if (external_call)
      touch_ob(o);
// ....

    // map extent into bufferheads
    map<loff_t, BufferHead*> hits, missing, rx, errors;
    o->map_read(*ex_it, hits, missing, rx, errors);
    /* map_read is broadly similar to map_write: where map_write locates the single cache segment to be written, map_read classifies the parts of the requested range into hits, missing, etc.; the implementation logic parallels map_write above */
    if (external_call) {
      // retry reading error buffers
      missing.insert(errors.begin(), errors.end());
    } else {
      // some reads had errors, fail later so completions
      // are cleaned up properly
      // TODO: make read path not call _readx for every completion
      hits.insert(errors.begin(), errors.end());
    }

    if (!missing.empty() || !rx.empty()) {
      // read the missing buffers in
      map<loff_t, BufferHead*>::iterator last = missing.end();
      for (map<loff_t, BufferHead*>::iterator bh_it = missing.begin();
           bh_it != missing.end(); ++bh_it) {
        uint64_t rx_bytes = static_cast<uint64_t>(stat_rx + bh_it->second->length());
        bytes_not_in_cache += bh_it->second->length();
        if (!waitfor_read.empty() || (stat_rx > 0 && rx_bytes > max_size)) {
          // cache is full with concurrent reads -- wait for rx's to complete
          // to constrain memory growth (especially during copy-ups)
          if (success) {
            // success starts out true; register a C_RetryRead so that once the
            // data has been read into the cache, the read is retried from the cache
            waitfor_read.push_back(new C_RetryRead(this, rd, oset, onfinish,
                                                   *trace));
              // vvvvvvvvvvvvvvvvvv
                         // C_RetryRead is simply a callback that re-invokes _readx
                         class ObjectCacher::C_RetryRead : public Context {
                              void finish(int r) override {
                                if (r >= 0) {
                                  r = oc->_readx(rd, oset, onfinish, false, &trace);
                                }
                                //...
                                if (onfinish) {
                                  onfinish->complete(r);  // TODO: onfinish init ??
                                }
                              }
                            };
              // ^^^^^^^^^^^^^^^^^^
          }

          bh_remove(o, bh_it->second);
          delete bh_it->second;
        } else {
          bh_it->second->set_nocache(nocache);
          bh_read(bh_it->second, rd->fadvise_flags, *trace);
          if ((success && onfinish) || last != missing.end())
            last = bh_it;
        }
        success = false;
      }

      // register the waiter on the last bh to avoid waking up too early, since
      // reads complete in order
      if (last != missing.end()) {
        last->second->waitfor_read[last->first].push_back(
          new C_RetryRead(this, rd, oset, onfinish, *trace) );
      }

      // handle buffers that are still being received (rx)
      for (map<loff_t, BufferHead*>::iterator bh_it = rx.begin(); bh_it != rx.end(); ++bh_it) {
          touch_bh(bh_it->second); // bump in lru, so we don't lose it.
          if (success && onfinish) {
              bh_it->second->waitfor_read[bh_it->first].push_back(
                  new C_RetryRead(this, rd, oset, onfinish, *trace) );
          }
          bytes_not_in_cache += bh_it->second->length();
          success = false;
      }

      for (map<loff_t, BufferHead*>::iterator bh_it = hits.begin(); bh_it != hits.end();  ++bh_it)
           //bump in lru, so we don't lose it when later read
           touch_bh(bh_it->second);

    } else {
      ceph_assert(!hits.empty());

      // make a plain list
      for (map<loff_t, BufferHead*>::iterator bh_it = hits.begin(); bh_it != hits.end();
      ++bh_it) {
           BufferHead *bh = bh_it->second;
           ldout(cct, 10) << "readx hit bh " << *bh << dendl;
           if (bh->is_error() && bh->error)
             error = bh->error;
           bytes_in_cache += bh->length();

           if (bh->get_nocache() && bh->is_clean())
             bh_lru_rest.lru_bottouch(bh);
           else
             touch_bh(bh);
           //must be after touch_bh because touch_bh set dontneed false
           if (dontneed &&
               ((loff_t)ex_it->offset <= bh->start() &&
                (bh->end() <=(loff_t)(ex_it->offset + ex_it->length)))) {
             bh->set_dontneed(true); //if dirty
             if (bh->is_clean())
               bh_lru_rest.lru_bottouch(bh);
           }
      }

      if (!error) {
           // create reverse map of buffer offset -> object for the
           // eventual result.  this is over a single ObjectExtent, so we
           // know that
           //  - the bh's are contiguous
           //  - the buffer frags need not be (and almost certainly aren't)
           loff_t opos = ex_it->offset;
           map<loff_t, BufferHead*>::iterator bh_it = hits.begin();
           ceph_assert(bh_it->second->start() <= opos);
           uint64_t bhoff = opos - bh_it->second->start();
           vector<pair<uint64_t,uint64_t> >::iterator f_it
             = ex_it->buffer_extents.begin();
           uint64_t foff = 0;
           while (1) {
             BufferHead *bh = bh_it->second;
             ceph_assert(opos == (loff_t)(bh->start() + bhoff));

             uint64_t len = std::min(f_it->second - foff, bh->length() - bhoff);
             ldout(cct, 10) << "readx rmap opos " << opos << ": " << *bh << " +"
                  << bhoff << " frag " << f_it->first << "~"
                  << f_it->second << " +" << foff << "~" << len
                  << dendl;

             bufferlist bit;
             // copy the hit data into stripe_map
             if (bh->is_zero()) {
               stripe_map[f_it->first].append_zero(len);
             } else {
               bit.substr_of(bh->bl,
              opos - bh->start(),
              len);
               stripe_map[f_it->first].claim_append(bit);
             }

             opos += len;
             bhoff += len;
             foff += len;
             if (opos == bh->end()) {
               ++bh_it;
               bhoff = 0;
             }
             if (foff == f_it->second) {
               ++f_it;
               foff = 0;
             }
             if (bh_it == hits.end()) break;
             if (f_it == ex_it->buffer_extents.end())
               break;
           }
           ceph_assert(f_it == ex_it->buffer_extents.end());
           ceph_assert(opos == (loff_t)ex_it->offset + (loff_t)ex_it->length);
      }

      if (dontneed && o->include_all_cached_data(ex_it->offset, ex_it->length))
     bottouch_ob(o);
    }
  }

  if (!success) {
    if (perfcounter && external_call) {
      perfcounter->inc(l_objectcacher_data_read, total_bytes_read);
      perfcounter->inc(l_objectcacher_cache_bytes_miss, bytes_not_in_cache);
      perfcounter->inc(l_objectcacher_cache_ops_miss);
    }
    if (onfinish) {
      ldout(cct, 20) << "readx defer " << rd << dendl;
    } else {
      ldout(cct, 20) << "readx drop " << rd << " (no complete, but no waiter)"
           << dendl;
      delete rd;
    }
    return 0;  // wait!
  }
  if (perfcounter && external_call) {
    perfcounter->inc(l_objectcacher_data_read, total_bytes_read);
    perfcounter->inc(l_objectcacher_cache_bytes_hit, bytes_in_cache);
    perfcounter->inc(l_objectcacher_cache_ops_hit);
  }

  // no misses... success!  do the read.
  ldout(cct, 10) << "readx has all buffers" << dendl;

  // ok, assemble into result buffer.
  uint64_t pos = 0;
  if (rd->bl && !error) {
     // assemble the stripe_map data into rd->bl to hand back to the caller
    rd->bl->clear();
    for (map<uint64_t,bufferlist>::iterator i = stripe_map.begin();
    i != stripe_map.end();
    ++i) {
      ceph_assert(pos == i->first);
      ldout(cct, 10) << "readx  adding buffer len " << i->second.length()
           << " at " << pos << dendl;
      pos += i->second.length();
      rd->bl->claim_append(i->second); 
      ceph_assert(rd->bl->length() == pos);
    }
    ldout(cct, 10) << "readx  result is " << rd->bl->length() << dendl;
  } else if (!error) {
    ldout(cct, 10) << "readx  no bufferlist ptr (readahead?), done." << dendl;
    map<uint64_t,bufferlist>::reverse_iterator i = stripe_map.rbegin();
    pos = i->first + i->second.length();
  }

  // done with read.
  int ret = error ? error : pos;
  ldout(cct, 20) << "readx done " << rd << " " << ret << dendl;
  ceph_assert(pos <= (uint64_t) INT_MAX);

  delete rd;
  trim();
  return ret;
}
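For completeness, the read entry point mirrors the write path from the beginning of this article: ObjectCacherObjectDispatch<I>::read() builds an OSDRead via prepare_read() and hands it to readx(), which forwards to _readx() under the cache lock. An abridged sketch (not the verbatim source):

template <typename I>
bool ObjectCacherObjectDispatch<I>::read(/**/) {
  // ensure we aren't holding the cache lock post-read
  on_dispatched = util::create_async_context_callback(*m_image_ctx, on_dispatched);

  // OSDRead bundles the snapshot id, the destination bufferlist and the fadvise flags
  ObjectCacher::OSDRead *rd = m_object_cacher->prepare_read(snap_id, read_data, op_flags);

  ObjectExtent extent(oid, 0, object_off, object_len, 0);
  extent.oloc.pool = m_image_ctx->data_ctx.get_id();
  extent.buffer_extents.push_back({0, object_len});
  rd->extents.push_back(extent);

  *dispatch_result = io::DISPATCH_RESULT_COMPLETE;

  m_cache_lock.Lock();
  // readx() takes the path analyzed above: it calls _readx(rd, oset, on_dispatched, true, &trace)
  m_object_cacher->readx(rd, m_object_set, on_dispatched, &trace);
  m_cache_lock.Unlock();
  return true;
}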