在读请求到达后,_do_read函数会调用如下函数来处理具体的读操作,其中op.extent.offset和op.extent.length分别是对象内的偏移和要读取的长度:
pgbackend->objects_read_sync(soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata)
在objects_read_sync函数中会直接调用
store->read(ch, ghobject_t(hoid), off, len, *bl, op_flags);
针对不同的存储引擎进行处理;如果是BlueStore,则会调用BlueStore中的read函数。
BlueStore::read函数的调用栈如下:
store->read(ch, ghobject_t(hoid), off, len, *bl, op_flags);
Collection *c = static_cast<Collection *>(c_.get());
const coll_t &cid = c->get_cid();
RWLock::RLocker l(c->lock);
OnodeRef o = c->get_onode(oid, false); //获取对象的onode信息
OnodeRef o = onode_map.lookup(oid);
ceph::unordered_map<ghobject_t,OnodeRef>::iterator p = onode_map.find(oid);
cache->_touch_onode(p->second); //BlueStore::LRUCache::_touch_onode
auto p = onode_lru.iterator_to(*o);
onode_lru.erase(p);
onode_lru.push_front(*o);
o = p->second;
if(o) return o;
//缓存中没有onode,从磁盘中找
get_object_key(store->cct, oid, &key); //获取对象的key
store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v); //“O” + key 从rocksdb中获取对象的onode信息
if(v.length() == 0)
on = new Onode(this, oid, key);
else
on = new Onode(this, oid, key);
on->exists = true; //磁盘中存在该onode
bufferptr::iterator p = v.front().begin_deep();
on->onode.decode(p); //解析从rocksdb中读取出来的onode信息
o.reset(on);
return onode_map.add(oid, o); //插入到onode_map中,并返回onode
if (!o || !o->exists) //onode不存在
r = -ENOENT;
goto out;
if (offset == length && offset == 0) //读全部
length = o->onode.size;
_do_read(c, o, offset, length, bl, op_flags);
if (offset + length > o->onode.size)
length = o->onode.size - offset;
unsigned left = length; //要读的长度
uint64_t pos = offset; //读取的数据在对象内的起始偏移
unsigned num_regions = 0;
auto lp = o->extent_map.seek_lextent(offset);
Extent dummy(offset);
auto fp = extent_map.lower_bound(dummy);//extent重载了比较运算符,以logical_offset比较
if (fp != extent_map.begin())
--fp;
if (fp->logical_end() <= offset)
++fp;
//上面代码是找到第一个包含offset的逻辑extent,即logical_offset<=offset<logical_end();如果没有extent包含offset,则定位到offset之后的第一个extent
while (left > 0 && lp != o->extent_map.extent_map.end()) //遍历后面的extent
if (pos < lp->logical_offset)
unsigned hole = lp->logical_offset - pos;
if (hole >= left)
break;
pos += hole;
left -= hole;
BlobRef& bptr = lp->blob;
unsigned l_off = pos - lp->logical_offset; //pos可能大于logical_offset,l_off就是pos在当前extent内的偏移,即在该extent内开始读取的位置
unsigned b_off = l_off + lp->blob_offset; //再加上该extent在blob中的起始偏移blob_offset,就得到要读取的数据在blob中的偏移
unsigned b_len = std::min(left, lp->length - l_off);
bptr->shared_blob->bc.read(bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval);
//在一个逻辑extent内查找,最大为b_len,因为b_len不可能大于一个逻辑extent的大小
while (b_len > 0)
blobs2read[bptr].emplace_back(region_t(pos, b_off, l)); //pos为要读取数据在对象内开始的逻辑地址,b_off为要读取数据在blob内的偏移,l为在本次extent内读取的大小
++num_regions;
pos += l;
b_off += l;
left -= l;
b_len -= l;
for (auto& p : blobs2read)
const BlobRef& bptr = p.first;
for (auto& reg : p.second)
uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
//往前往后扩展
reg.r_off = reg.blob_xoffset; //对应上面的b_off
uint64_t r_len = reg.length;
reg.front = reg.r_off % chunk_size;
//因为一次读取的起始和结束地址可能不是按chunk_size对齐的,因此需要往前往后扩展,以达到对齐
if (reg.front)
reg.r_off -= reg.front;
r_len += reg.front; //在前面多读一些以保持块对齐,因为b_off插入到blobs2read时就不是对齐的
unsigned tail = r_len % chunk_size;
if (tail)
r_len += chunk_size - tail; //在后面多读一些
r = bptr->get_blob().map(
reg.r_off, r_len,
[&](uint64_t offset, uint64_t length) {
int r;
// use aio if there is more than one region to read
if (num_regions > 1) {
r = bdev->aio_read(offset, length, &reg.bl, &ioc); //异步IO读取数据
ioc->pending_aios.push_back(aio_t(ioc, fd_direct));//将请求push到pending_aios
++ioc->num_pending;
aio_t& aio = ioc->pending_aios.back();
aio.pread(off, len);
io_prep_pread(&iocb, fd, p.c_str(), length, offset);
bl.append(std::move(p));
pbl->append(aio.bl);
} else {
r = bdev->read(offset, length, &reg.bl, &ioc, false);
}
if (r < 0)
return r;
return 0;
});
//bluestore_blob_t的map函数如下
/**
auto p = extents.begin(); //pextent,结合blob中的偏移,再加上pextent中的磁盘偏移,就可以确定准确位置
assert(p != extents.end());
while (x_off >= p->length) { //找到x_off属于的pextent,x_off是blob内的偏移,p->length为这个pextent的长度,最后while循环退出时,x_off的值就是在当前pextent内的起始地址偏移
x_off -= p->length;
++p;
assert(p != extents.end());
}
while (x_len > 0) {
assert(p != extents.end());
uint64_t l = std::min(p->length - x_off, x_len); //该extent最大只能包括x_len或者p->length-x_off空间
int r = f(p->offset + x_off, l); //p->offset就是相对于磁盘的物理偏移,x_off是相对于当前pextent的偏移,两者相加就得到起始地址在磁盘内的物理偏移,然后调用传进来的函数去读取该区域的磁盘数据
if (r < 0)
return r;
x_off = 0;
x_len -= l;
++p;
}
**/
if (ioc.has_pending_aios()) //return num_pending.load();
bdev->aio_submit(&ioc);
int pending = ioc->num_pending.load();
list<aio_t>::iterator e = ioc->running_aios.begin();
ioc->running_aios.splice(e, ioc->pending_aios); //将pending_aios中的事件插入到e之前,则从running_aios.begin()到e都是新插入的需要提交的事件
ioc->num_running += pending;
ioc->num_pending -= pending;
void *priv = static_cast<void*>(ioc);
aio_queue.submit_batch(ioc->running_aios.begin(), e, pending, priv, &retries);
struct iocb *piocb[aios_size]; //aios_size就是pending,即等待submit的大小
int left = 0;
while (cur != end)
cur->priv = priv;
*(piocb+left) = &cur->iocb;
++left;
++cur;
while (left > 0)
io_submit(ctx, std::min(left, max_iodepth), piocb + done);
done += r;
left -= r;
ioc.aio_wait(); //等待异步IO事件完成
while (num_running.load() > 0)
cond.wait(l);
ioc.get_return_value();
return r;
blobs2read_t::iterator b2r_it = blobs2read.begin();
while (b2r_it != blobs2read.end())
const BlobRef& bptr = b2r_it->first;
for (auto& reg : b2r_it->second)
if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,reg.logical_offset) < 0) {
return -EIO;
auto pr = ready_regions.begin(); //typedef map<uint64_t, bufferlist> ready_regions_t;
auto pr_end = ready_regions.end();
pos = 0;
while (pos < length)
if (pr != pr_end && pr->first == pos + offset)
pos += pr->second.length();
bl.claim_append(pr->second); //读取全部
++pr;
else
uint64_t l = length - pos;
l = pr->first - (pos + offset);
bl.append_zero(l);
pos += l; //下一个就绪区域的起始地址比当前位置大,说明中间有空洞,空洞部分用0填充