onode是bluestore中的元数据形式,由于bluestore直接写裸盘,因此需要onode来管理对象。本文就讲讲onode的缓存算法。
在bluestore的cache中存在着lru和twoq两种实现,但不管是哪一种,onode元数据的cache采用的都是lru算法(下文代码取自TwoQCache,其中onode使用的队列是onode_lru)。
1.lru上已有元素在访问到时怎么到队首?
这部分得从get_onode讲起,在其中调用了
BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
lueStore::OnodeRef BlueStore::Collection::get_onode(
const ghobject_t& oid,
bool create)
{
assert(create ? lock.is_wlocked() : lock.is_locked());
spg_t pgid;
if (cid.is_pg(&pgid)) {
if (!oid.match(cnode.bits, pgid.ps())) {
lderr(store->cct) << __func__ << " oid " << oid << " not part of "
<< pgid << " bits " << cnode.bits << dendl;
ceph_abort();
}
}
OnodeRef o = onode_map.lookup(oid);
if (o)
return o;
mempool::bluestore_cache_other::string key;
get_object_key(store->cct, oid, &key);
ldout(store->cct, 20) << __func__ << " oid " << oid << " key "
<< pretty_binary_string(key) << dendl;
bufferlist v;
int r = store->db->get(PREFIX_OBJ, key.c_str(), key.size(), &v);
ldout(store->cct, 20) << " r " << r << " v.len " << v.length() << dendl;
Onode *on;
if (v.length() == 0) {
assert(r == -ENOENT);
if (!store->cct->_conf->bluestore_debug_misc &&
!create)
return OnodeRef();
// new object, new onode
on = new Onode(this, oid, key);
} else {
// loaded
assert(r >= 0);
on = new Onode(this, oid, key);
on->exists = true;
bufferptr::iterator p = v.front().begin_deep();
on->onode.decode(p);
for (auto& i : on->onode.attrs) {
i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_other);
}
// initialize extent_map
on->extent_map.decode_spanning_blobs(p);
if (on->onode.extent_map_shards.empty()) {
denc(on->extent_map.inline_bl, p);
on->extent_map.decode_some(on->extent_map.inline_bl);
on->extent_map.inline_bl.reassign_to_mempool(
mempool::mempool_bluestore_cache_other);
} else {
on->extent_map.init_shards(false, false);
}
}
o.reset(on);
return onode_map.add(oid, o);
}
再关注一下lookup函数,发现在其中当命中了cache时会调用cache中的
void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
// Find oid in onode_map. On a hit the onode is passed to the cache's
// _touch_onode() and returned; on a miss an empty OnodeRef is returned.
// The hit/miss perf counter is bumped after the cache lock is released.
BlueStore::OnodeRef BlueStore::OnodeSpace::lookup(const ghobject_t& oid)
{
  ldout(cache->cct, 30) << __func__ << dendl;

  OnodeRef found;
  bool was_hit = false;
  {
    std::lock_guard<std::recursive_mutex> guard(cache->lock);
    auto it = onode_map.find(oid);
    if (it != onode_map.end()) {
      ldout(cache->cct, 30) << __func__ << " " << oid << " hit " << it->second
                            << dendl;
      cache->_touch_onode(it->second);
      found = it->second;
      was_hit = true;
    } else {
      ldout(cache->cct, 30) << __func__ << " " << oid << " miss" << dendl;
    }
  }

  cache->logger->inc(was_hit ? l_bluestore_onode_hits
                             : l_bluestore_onode_misses);
  return found;
}
于是在void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)中,
会将命中的onode先从队列中删除,再重新添加到队首
// Move the given onode to the front of onode_lru: unlink it from its
// current position, then re-insert it at the head.
void BlueStore::TwoQCache::_touch_onode(OnodeRef& o)
{
  Onode& node = *o;
  onode_lru.erase(onode_lru.iterator_to(node));
  onode_lru.push_front(node);
}
2.lru中元素是怎么增加的?
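回到get_onode,该函数有两个参数(oid和create)。当未在onode_map中查找到对应的onode时,会先到db中按key读取:如果db中有记录,则new一个onode并解码出已有的元数据;如果db中也没有记录,当create参数为false时(且未开启bluestore_debug_misc)直接返回空的OnodeRef,而当create参数为true时,则new一个全新的onode对象。这两种情况下new出来的onode最后都会调用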
BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o)
在onode_map中添加该onode,并调用cache中的方法
_add_onode(OnodeRef& o, int level)将其加到lru中
// Register o under oid in onode_map and hand it to the cache at level 1.
// If oid is already present, nothing is inserted and the existing entry
// is returned instead.
BlueStore::OnodeRef BlueStore::OnodeSpace::add(const ghobject_t& oid, OnodeRef o)
{
  std::lock_guard<std::recursive_mutex> guard(cache->lock);

  const auto existing = onode_map.find(oid);
  if (existing == onode_map.end()) {
    ldout(cache->cct, 30) << __func__ << " " << oid << " " << o << dendl;
    onode_map[oid] = o;
    cache->_add_onode(o, 1);
    return o;
  }

  ldout(cache->cct, 30) << __func__ << " " << oid << " " << o
                        << " raced, returning existing " << existing->second
                        << dendl;
  return existing->second;
}
将onode加入到lru中,源码中调用处的level都为1,所以都放在队首
// Insert o into onode_lru: at the tail when level is zero or negative,
// at the head otherwise.
void _add_onode(OnodeRef& o, int level) override {
  if (level <= 0) {
    onode_lru.push_back(*o);
    return;
  }
  onode_lru.push_front(*o);
}
3.什么时候对lru做trim?
入口是void *BlueStore::MempoolThread::entry()。在其中能看到trim是定时启动的,间隔时间由bluestore_cache_trim_interval控制,默认是0.2s。从中也能看出,一个osd并不只有一个onode的lru,而是有多个:一个cache shard对应一个lru,默认hdd有5个,ssd稍多,有8个。
void *BlueStore::MempoolThread::entry()
{
Mutex::Locker l(lock);
while (!stop) {
uint64_t meta_bytes =
mempool::bluestore_cache_other::allocated_bytes() +
mempool::bluestore_cache_onode::allocated_bytes();
uint64_t onode_num =
mempool::bluestore_cache_onode::allocated_items();
if (onode_num < 2) {
onode_num = 2;
}
float bytes_per_onode = (float)meta_bytes / (float)onode_num;
size_t num_shards = store->cache_shards.size();
float target_ratio = store->cache_meta_ratio + store->cache_data_ratio;
// A little sloppy but should be close enough
uint64_t shard_target = target_ratio * (store->cache_size / num_shards);
for (auto i : store->cache_shards) {
i->trim(shard_target,
store->cache_meta_ratio,
store->cache_data_ratio,
bytes_per_onode);
}
store->_update_cache_logger();
utime_t wait;
wait += store->cct->_conf->bluestore_cache_trim_interval;
cond.WaitInterval(lock, wait);
}
stop = false;
return NULL;
}
关注trim函数,可以看到只有当current > target_bytes时才会去调用cache中的_trim;否则只打印一条日志就直接返回
// Decide how much this cache shard has to give up and delegate the actual
// eviction to _trim().
//
// target_bytes       byte budget for this shard
// target_meta_ratio  share of the budget intended for onode metadata
// target_data_ratio  share of the budget intended for data buffers
// bytes_per_onode    estimated metadata cost of one cached onode
//
// When current usage is within target_bytes only a debug line is logged.
// Otherwise the excess is taken from buffers first (down to their target)
// and the remainder from metadata; the resulting limits are passed to _trim().
void BlueStore::Cache::trim(
  uint64_t target_bytes,
  float target_meta_ratio,
  float target_data_ratio,
  float bytes_per_onode)
{
  std::lock_guard<std::recursive_mutex> guard(lock);

  const uint64_t meta_now = _get_num_onodes() * bytes_per_onode;
  const uint64_t buffer_now = _get_buffer_bytes();
  const uint64_t total_now = meta_now + buffer_now;

  // correct for overflow or float imprecision
  uint64_t meta_goal = target_bytes * target_meta_ratio;
  meta_goal = min(target_bytes, meta_goal);
  uint64_t buffer_goal = target_bytes * target_data_ratio;
  buffer_goal = min(target_bytes - meta_goal, buffer_goal);

  if (total_now <= target_bytes) {
    dout(10) << __func__
             << " shard target " << pretty_si_t(target_bytes)
             << " meta/data ratios " << target_meta_ratio
             << " + " << target_data_ratio << " ("
             << pretty_si_t(meta_goal) << " + "
             << pretty_si_t(buffer_goal) << "), "
             << " current " << pretty_si_t(total_now) << " ("
             << pretty_si_t(meta_now) << " + "
             << pretty_si_t(buffer_now) << ")"
             << dendl;
    return;
  }

  // Over budget: buffers pay first, but never below their own target and
  // never more than the total excess; metadata covers what is left.
  const uint64_t excess = total_now - target_bytes;
  uint64_t buffer_cut = 0;
  if (buffer_now > buffer_goal) {
    buffer_cut = min(buffer_now - buffer_goal, excess);
  }
  const uint64_t meta_cut = excess - buffer_cut;

  // start bounds at what we have now
  const uint64_t buffer_limit = buffer_now - buffer_cut;
  const uint64_t meta_limit = meta_now - meta_cut;
  const uint64_t onode_limit = meta_limit / bytes_per_onode;

  dout(10) << __func__
           << " shard target " << pretty_si_t(target_bytes)
           << " ratio " << target_meta_ratio << " ("
           << pretty_si_t(meta_goal) << " + "
           << pretty_si_t(buffer_goal) << "), "
           << " current " << pretty_si_t(total_now) << " ("
           << pretty_si_t(meta_now) << " + "
           << pretty_si_t(buffer_now) << "),"
           << " need_to_free " << pretty_si_t(excess) << " ("
           << pretty_si_t(meta_cut) << " + "
           << pretty_si_t(buffer_cut) << ")"
           << " -> max " << onode_limit << " onodes + "
           << buffer_limit << " buffer"
           << dendl;
  _trim(onode_limit, buffer_limit);
}
_trim函数的前半部分处理的是数据(buffer)的cache,本文不关注,故略去(代码中以···表示)
// Evict onodes from the tail of onode_lru until at most onode_max remain.
// An onode whose nref is above 1 is skipped rather than evicted, and the
// scan stops early once bluestore_cache_trim_max_skip_pinned entries have
// been skipped. buffer_max is only logged in the part shown here; the code
// that uses it is presumably what the "···" line below elides.
void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max)
{
dout(20) << __func__ << " onodes " << onode_lru.size() << " / " << onode_max
<< " buffers " << buffer_bytes << " / " << buffer_max
<< dendl;
_audit("trim start");
···
// onodes
// trim only when the LRU holds more onodes than onode_max
int num = onode_lru.size() - onode_max;
if (num <= 0)
return; // don't even try
// walk from the back toward the front: the tail holds the entries that
// have gone longest without being touched
auto p = onode_lru.end();
assert(p != onode_lru.begin());
--p;
int skipped = 0;
int max_skipped = g_conf->bluestore_cache_trim_max_skip_pinned;
while (num > 0) {
Onode *o = &*p;
dout(20) << __func__ << " considering " << o << dendl;
int refs = o->nref.load();
// check whether anyone else still holds a reference to this onode
if (refs > 1) {
dout(20) << __func__ << " " << o->oid << " has " << refs
<< " refs; skipping" << dendl;
// Stop trimming once the skip limit is reached. Presumably the goal is to
// finish the trim quickly so the main I/O path is not held up: per the
// article, both that path and this trim take
// std::lock_guard<std::recursive_mutex> l(lock) around the LRU and related data.
if (++skipped >= max_skipped) {
dout(20) << __func__ << " maximum skip pinned reached; stopping with "
<< num << " left to trim" << dendl;
break;
}
if (p == onode_lru.begin()) {
break;
} else {
// note: a skipped entry still counts against num
p--;
num--;
continue;
}
}
dout(30) << __func__ << " " << o->oid << " num=" << num <<" lru size="<<onode_lru.size()<< dendl;
// unlink from the LRU; p is stepped toward the front as part of the erase
// so it keeps pointing at a live element
if (p != onode_lru.begin()) {
onode_lru.erase(p--);
} else {
// front element: nothing left to step to, so this must be the last one
onode_lru.erase(p);
assert(num == 1);
}
o->get(); // paranoia
// remove from the owning collection's onode_map
o->c->onode_map.remove(o->oid);
o->put();
--num;
}
}