ceph bluestore中读操作的处理

在读请求到达后_do_read函数会调用如下函数来处理具体的读操作,其中op.extent.offset和op.extent.length是在对象内的偏移和长度

pgbackend->objects_read_sync(soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata)

在objects_read_sync函数中会直接调用

store->read(ch, ghobject_t(hoid), off, len, *bl, op_flags);

来针对不同的存储引擎进行处理;如果是BlueStore,则会调用BlueStore中的read函数。
BlueStore::read函数的调用栈如下:

store->read(ch, ghobject_t(hoid), off, len, *bl, op_flags);
	Collection *c = static_cast<Collection *>(c_.get());
	const coll_t &cid = c->get_cid();
	RWLock::RLocker l(c->lock);
	OnodeRef o = c->get_onode(oid, false); //获取对象的onode信息
		OnodeRef o = onode_map.lookup(oid);
			ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
			cache->_touch_onode(p->second);	//BlueStore::LRUCache::_touch_onode
				auto p = onode_lru.iterator_to(*o);
				onode_lru.erase(p);
				onode_lru.push_front(*o);
			o = p->second;
		if(o) return o;
		//缓存中没有onode,从磁盘中找
		get_object_key(store->cct, oid, &key);   //获取对象的key
		store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);	//“O” + key 从rocksdb中获取对象的onode信息
		if(v.length() == 0)
			on = new Onode(this, oid, key);
		else
			on = new Onode(this, oid, key);
			on->exists = true; //磁盘中存在该onode
			bufferptr::iterator p = v.front().begin_deep();
			on->onode.decode(p); //解析从rocksdb中读取出来的onode信息
		o.reset(on);
		return onode_map.add(oid, o);  //插入到onode_map中,并返回onode
	if (!o || !o->exists)	//onode不存在
		r = -ENOENT;
		goto out;	
	if (offset == length && offset == 0)  //读全部
		length = o->onode.size;
	_do_read(c, o, offset, length, bl, op_flags);
		if (offset + length > o->onode.size)
			length = o->onode.size - offset;
		unsigned left = length; //要读的长度
		uint64_t pos = offset; //读取的数据在对象内的起始偏移
		unsigned num_regions = 0;	
		auto lp = o->extent_map.seek_lextent(offset);
			Extent dummy(offset);
			auto fp = extent_map.lower_bound(dummy);//extent重载了比较运算符,以logical_offset比较
			if (fp != extent_map.begin())  
				--fp;
				if (fp->logical_end() <= offset)
					++fp;
			//上面代码是找到第一个包含offset的逻辑extent,即logical_offset<=offset<logical_end();如果没有extent包含offset,则定位到offset之后的第一个extent
		while (left > 0 && lp != o->extent_map.extent_map.end()) //遍历后面的extent
			if (pos < lp->logical_offset)
				unsigned hole = lp->logical_offset - pos;
				if (hole >= left)
					break;
				pos += hole;
				left -= hole
			BlobRef& bptr = lp->blob;	
			unsigned l_off = pos - lp->logical_offset;	 //pos可能大于logical_offset,l_off就是pos在当前extent内的偏移,即本次读取在该extent内的起始位置
			unsigned b_off = l_off + lp->blob_offset;  //加上该extent在blob中的起始偏移blob_offset,就得到要读取的数据在blob中的偏移
			unsigned b_len = std::min(left, lp->length - l_off);
			bptr->shared_blob->bc.read(bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval);
			//在一个逻辑extent内查找,最大为b_len,因为b_len不可能大于一个逻辑extent的大小
			while (b_len > 0)
				blobs2read[bptr].emplace_back(region_t(pos, b_off, l)); //pos为要读取数据在对象内开始的逻辑地址,b_off为要读取数据在blob内的偏移,l为在本次extent内读取的大小
				++num_regions;
		 pos += l;
		 b_off += l;	
		 left -= l;
		 b_len -= l;		
	
		for (auto& p : blobs2read)
			const BlobRef& bptr = p.first;
			for (auto& reg : p.second)
				uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
				//往前往后扩展
				reg.r_off = reg.blob_xoffset; //对应上面的b_off 
				uint64_t r_len = reg.length;  
				reg.front = reg.r_off % chunk_size; 
				//因为一次读取的起始和结束地址可能不是chunk_size对齐的,因此需要往前往后扩展,以达到对齐
				if (reg.front) 
					reg.r_off -= reg.front;
					r_len += reg.front; //在前面多读一些以保持块对齐,因为b_off插入到blobs2read的时候就不是对齐的
				unsigned tail = r_len % chunk_size;
				if (tail)
					r_len += chunk_size - tail; //在后面多读一些
				r = bptr->get_blob().map(
				  reg.r_off, r_len,
				  [&](uint64_t offset, uint64_t length) {
					int r;
					// use aio if there is more than one region to read
					if (num_regions > 1) {
						r = bdev->aio_read(offset, length, &reg.bl, &ioc); //异步IO读取数据
							ioc->pending_aios.push_back(aio_t(ioc, fd_direct));//将请求push到pending_aios
							++ioc->num_pending;
							aio_t& aio = ioc->pending_aios.back();
							aio.pread(off, len);
								io_prep_pread(&iocb, fd, p.c_str(), length, offset);  
								bl.append(std::move(p));
							pbl->append(aio.bl);
					} else {
					  r = bdev->read(offset, length, &reg.bl, &ioc, false);
					}
					if (r < 0)
						  return r;
						return 0;
				  });
				  //bluestore_blob_t的map函数如下
				  /**
					auto p = extents.begin(); //pextent,结合blob中的偏移,再加上pextent中的磁盘偏移,就可以确定准确位置
					assert(p != extents.end());
					while (x_off >= p->length) {  //找到x_off属于的pextent,x_off是blob内的偏移,p->length为这个pextent的长度,最后while循环退出时,x_off的值就是在当前pextent内的起始地址偏移
					  x_off -= p->length;
					  ++p;
					  assert(p != extents.end());
					} 
					while (x_len > 0) {
					  assert(p != extents.end());
					  uint64_t l = std::min(p->length - x_off, x_len); //该extent最大只能包括x_len或者p->length-x_off空间
					  int r = f(p->offset + x_off, l); //p->offset就是相对于磁盘的物理偏移,x_off是相对于当前pextent的偏移,两者相加就得到起始地址在磁盘内的物理偏移,然后调用传进来的函数去读取该区域的磁盘数据
					  if (r < 0)
						return r;
					  x_off = 0;
					  x_len -= l;
					  ++p;
					}					  
				  
				  
				  **/
				
		if (ioc.has_pending_aios())	  //return num_pending.load();
			bdev->aio_submit(&ioc);
				int pending = ioc->num_pending.load();
				list<aio_t>::iterator e = ioc->running_aios.begin();
				ioc->running_aios.splice(e, ioc->pending_aios);  //将pending_aios中的事件插入到e之前,则从running_aios.begin到e都是新插入的需要提交的事件
				ioc->num_running += pending;
				ioc->num_pending -= pending;
				void *priv = static_cast<void*>(ioc);
				aio_queue.submit_batch(ioc->running_aios.begin(), e, pending, priv, &retries);
					struct iocb *piocb[aios_size];  //aios_size就是pending,即等待submit的大小
					int left = 0;
					while (cur != end)
						cur->priv = priv;
						*(piocb+left) = &cur->iocb;
						++left;
						++cur;
					while (left > 0)
						io_submit(ctx, std::min(left, max_iodepth), piocb + done);
						done += r;
						left -= r;
			ioc.aio_wait(); //等待异步IO事件完成
				while (num_running.load() > 0)
					cond.wait(l);
			ioc.get_return_value();
				return r;
			blobs2read_t::iterator b2r_it = blobs2read.begin();
			while (b2r_it != blobs2read.end())
				const BlobRef& bptr = b2r_it->first;
				for (auto& reg : b2r_it->second) 
					if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,reg.logical_offset) < 0) {
						return -EIO;
			auto pr = ready_regions.begin(); //typedef map<uint64_t, bufferlist> ready_regions_t;
			auto pr_end = ready_regions.end();
			pos = 0;
			while (pos < length)
				if (pr != pr_end && pr->first == pos + offset)  
					pos += pr->second.length();
					bl.claim_append(pr->second);  //读取全部
					++pr;
				else
					uint64_t l = length - pos;
					l = pr->first - (pos + offset);
					bl.append_zero(l);
					pos += l;  //下一个ready region的起始地址比当前位置大,说明中间有空洞,用0填充
  • 2
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值