OSD 收集 stats 数据(代码位置:OSD.cc::collect_pg_stats())
mgr 接收 report:
// mgr-side message dispatch: routes incoming daemon messages by type.
// Only the MSG_PGSTATS case is shown in this excerpt.
bool DaemonServer::ms_dispatch(Message *m)
{
switch (m->get_type()) {
case MSG_PGSTATS:
// Fold this OSD's stats into PGMap::Incremental *pending_inc.
cluster_state.ingest_pgstats(static_cast<MPGStats*>(m));
// Once all OSDs have reported, aggregate pg stats into pools and
// send the report to the monitors.
maybe_ready(m->get_source().num());
m->put();
return true;
。。。
};
}
等待所有 osd 完成首次报告后,立即发送报告:
// Record the initial stats report from @p osd_id.  Once every OSD that
// the OSDMap considers "up" has reported at least once, mark the PGMap
// ready and push the first report to the monitors right away instead of
// waiting for the next tick.
void DaemonServer::maybe_ready(int32_t osd_id)
{
  if (pgmap_ready.load()) {
    // Fast path: we don't need to take lock because pgmap_ready
    // is already set
    return;
  }

  std::lock_guard l(lock);
  if (reported_osds.count(osd_id)) {
    // not the first report from this osd; nothing to track
    return;
  }

  dout(4) << "initial report from osd " << osd_id << dendl;
  reported_osds.insert(osd_id);

  // Which OSDs are up right now, per the current OSDMap?
  std::set<int32_t> up;
  cluster_state.with_osdmap([&](const OSDMap& osdmap) {
    osdmap.get_up_osds(up);
  });

  // up \ reported = OSDs we are still waiting on.
  std::set<int32_t> missing;
  std::set_difference(up.begin(), up.end(),
                      reported_osds.begin(), reported_osds.end(),
                      std::inserter(missing, missing.begin()));

  if (!missing.empty()) {
    dout(4) << "still waiting for " << missing.size() << " osds"
      " to report in before PGMap is ready" << dendl;
    return;
  }

  dout(4) << "all osds have reported, sending PG state to mon" << dendl;
  pgmap_ready = true;
  reported_osds.clear();
  // Avoid waiting for next tick
  send_report();
}
发送报告
// Push the accumulated PG state to the monitors.  If not every OSD has
// reported yet this normally defers; but once the mgr has been up for
// more than mgr_stats_period * 4 it gives up and force-sends whatever
// it has, so a dead OSD cannot block reporting forever.
void DaemonServer::send_report()
{
if (!pgmap_ready) {
// Up longer than mgr_stats_period * 4 with OSDs still missing:
// forcibly submit the current (possibly incomplete) update.
if (ceph_clock_now() - started_at > g_conf().get_val<int64_t>("mgr_stats_period") * 4.0) {
pgmap_ready = true;
reported_osds.clear();
dout(1) << "Giving up on OSDs that haven't reported yet, sending "
<< "potentially incomplete PG state to mon" << dendl;
} else {
dout(1) << "Not sending PG status to monitor yet, waiting for OSDs"
<< dendl;
return;
}
}
......
cluster_state.with_mutable_pgmap([&](PGMap& pg_map) {
// Aggregate pg/pool stats and compute the smoothed deltas.
cluster_state.update_delta_stats();
......
});
......
// Ship the report to the monitors.
monc->send_mon_message(m);
}
整合增量并更新 PGMap(取时间戳、应用增量、重置 pending_inc):
void ClusterState::update_delta_stats()
{
pending_inc.stamp = ceph_clock_now(); //获取时间戳
pending_inc.version = pg_map.version + 1; // to make apply_incremental happy
dout(10) << " v" << pending_inc.version << dendl;
dout(30) << " pg_map before:\n";
JSONFormatter jf(true);
jf.dump_object("pg_map", pg_map);
jf.flush(*_dout);
*_dout << dendl;
dout(30) << " incremental:\n";
JSONFormatter jf(true);
jf.dump_object("pending_inc", pending_inc);
jf.flush(*_dout);
*_dout << dendl;
pg_map.apply_incremental(g_ceph_context, pending_inc); //从incremental获取数据,更新pgmap
pending_inc = PGMap::Incremental(); //重置pending_inc
}
应用增量,更新 PGMap:
// Apply one Incremental to this PGMap: refresh per-pg, per-osd and
// per-(pool,osd) stats, drop removed pgs/osds, and update the smoothed
// rate deltas used for io statistics reporting.
void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
{
  ceph_assert(inc.version == version+1);
  version++;

  pool_stat_t pg_sum_old = pg_sum;  // snapshot: cluster-wide pg sum
  mempool::pgmap::unordered_map<int32_t, pool_stat_t> pg_pool_sum_old;
  pg_pool_sum_old = pg_pool_sum;    // snapshot: per-pool sums

  // Per-pg updates: swap in the new pg_stat_t while keeping both the
  // cluster-wide sum (stat_pg_sub/add) and the owning pool's sum
  // consistent.
  for (auto p = inc.pg_stat_updates.begin();
       p != inc.pg_stat_updates.end();
       ++p) {
    const pg_t &update_pg(p->first);            // pgid
    auto update_pool = update_pg.pool();        // owning pool
    const pg_stat_t &update_stat(p->second);

    auto pg_stat_iter = pg_stat.find(update_pg);
    pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool];
    if (pg_stat_iter == pg_stat.end()) {
      // first report for this pg
      pg_stat.insert(make_pair(update_pg, update_stat));
    } else {
      // retire the stale stats before replacing them
      stat_pg_sub(update_pg, pg_stat_iter->second);
      pool_sum_ref.sub(pg_stat_iter->second);
      pg_stat_iter->second = update_stat;
    }
    stat_pg_add(update_pg, update_stat);
    pool_sum_ref.add(update_stat);
  }

  // Per-(pool, osd) statfs updates; pools absent from pg_pool_sum are
  // ignored.
  for (auto p = inc.pool_statfs_updates.begin();
       p != inc.pool_statfs_updates.end();
       ++p) {
    auto update_pool = p->first.first;          // pool id
    auto update_osd = p->first.second;          // osd id
    auto& statfs_inc = p->second;

    auto pool_statfs_iter =
      pool_statfs.find(std::make_pair(update_pool, update_osd));
    if (pg_pool_sum.count(update_pool)) {
      pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool];
      if (pool_statfs_iter == pool_statfs.end()) {
        pool_statfs.emplace(std::make_pair(update_pool, update_osd), statfs_inc);
      } else {
        pool_sum_ref.sub(pool_statfs_iter->second);  // retire the old value
        pool_statfs_iter->second = statfs_inc;
      }
      pool_sum_ref.add(statfs_inc);
    }
  }

  // Per-osd stat updates, mirroring the per-pg handling above.
  for (auto p = inc.get_osd_stat_updates().begin();
       p != inc.get_osd_stat_updates().end();
       ++p) {
    int osd = p->first;
    const osd_stat_t &new_stats(p->second);
    auto t = osd_stat.find(osd);
    if (t == osd_stat.end()) {
      osd_stat.insert(make_pair(osd, new_stats));
    } else {
      stat_osd_sub(t->first, t->second);
      t->second = new_stats;
    }
    stat_osd_add(osd, new_stats);
  }

  // Removed pgs; a pool whose last pg disappears is remembered so it can
  // be fully deleted below.
  set<int64_t> deleted_pools;
  for (auto p = inc.pg_remove.begin();
       p != inc.pg_remove.end();
       ++p) {
    const pg_t &removed_pg(*p);
    auto s = pg_stat.find(removed_pg);
    bool pool_erased = false;
    if (s != pg_stat.end()) {
      pool_erased = stat_pg_sub(removed_pg, s->second);
      pg_stat.erase(s);
      if (pool_erased) {
        deleted_pools.insert(removed_pg.pool());
      }
    }
  }

  // Removed osds: drop their osd_stat and every per-(pool, osd) statfs
  // entry they contributed.
  for (auto p = inc.get_osd_stat_rm().begin();
       p != inc.get_osd_stat_rm().end();
       ++p) {
    auto t = osd_stat.find(*p);
    if (t != osd_stat.end()) {
      stat_osd_sub(t->first, t->second);
      osd_stat.erase(t);
    }
    // BUGFIX: erase() invalidates the erased iterator, so advance with
    // its return value; the previous "++i after erase" in the loop
    // header was undefined behavior.
    for (auto i = pool_statfs.begin(); i != pool_statfs.end(); ) {
      if (i->first.second == *p) {
        pg_pool_sum[i->first.first].sub(i->second);
        i = pool_statfs.erase(i);
      } else {
        ++i;
      }
    }
  }

  // skip calculating delta while sum was not synchronized
  if (!stamp.is_zero() && !pg_sum_old.stats.sum.is_zero()) {
    utime_t delta_t;        // wall time covered by this incremental
    delta_t = inc.stamp;
    delta_t -= stamp;
    // calculate a delta, and average over the last N deltas
    // (mon_stat_smooth_intervals, default 2 here per the comment below).
    pool_stat_t d = pg_sum;          // cluster-wide sum, post-update
    d.stats.sub(pg_sum_old.stats);   // minus the pre-update snapshot
    pg_sum_deltas.push_back(make_pair(d, delta_t));
    stamp_delta += delta_t;
    pg_sum_delta.stats.add(d.stats);
    auto smooth_intervals =
      cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
    while (pg_sum_deltas.size() > smooth_intervals) {
      // retire the oldest delta from the smoothed aggregates
      pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats);
      stamp_delta -= pg_sum_deltas.front().second;
      pg_sum_deltas.pop_front();
    }
  }
  stamp = inc.stamp;  // pgmap now reflects this incremental's timestamp

  update_pool_deltas(cct, inc.stamp, pg_pool_sum_old);  // per-pool deltas

  for (auto p : deleted_pools) {
    if (cct)
      dout(20) << " deleted pool " << p << dendl;
    deleted_pool(p);
  }

  if (inc.osdmap_epoch)
    last_osdmap_epoch = inc.osdmap_epoch;
  if (inc.pg_scan)
    last_pg_scan = inc.pg_scan;
}
对pool计算增量
// Recompute the smoothed io delta for every pool present in the previous
// per-pool snapshot @p pg_pool_sum_old, stamped with @p ts.
void PGMap::update_pool_deltas(
  CephContext *cct, const utime_t ts,
  const mempool::pgmap::unordered_map<int32_t,pool_stat_t>& pg_pool_sum_old)
{
  for (const auto& [pool_id, old_sum] : pg_pool_sum_old) {
    update_one_pool_delta(cct, ts, pool_id, old_sum);
  }
}
......
// Update the smoothed delta bookkeeping for a single pool.  The three
// per-pool maps are kept in lockstep: either the pool exists in all of
// them or in none (asserted below); operator[] creates the missing
// entries on first sight of a pool.
void PGMap::update_one_pool_delta(
  CephContext *cct,
  const utime_t ts,
  const int64_t pool,
  const pool_stat_t& old_pool_sum)
{
  if (per_pool_sum_deltas.count(pool) == 0) {
    // first time we see this pool: none of its siblings may exist yet
    ceph_assert(per_pool_sum_deltas_stamps.count(pool) == 0);
    ceph_assert(per_pool_sum_delta.count(pool) == 0);
  }
  auto& agg = per_pool_sum_delta[pool];            // (delta sum, last ts)
  auto& stamp_sum = per_pool_sum_deltas_stamps[pool];
  auto& history = per_pool_sum_deltas[pool];
  update_delta(cct, ts, old_pool_sum, &agg.second, pg_pool_sum[pool],
               &agg.first, &stamp_sum, &history);
}
计算n个增量
/**
 * update aggregated delta
 *
 * Computes the stat delta between @p current_pool_sum and
 * @p old_pool_sum, appends it (with its timestamp delta) to
 * @p delta_avg_list, and trims the list to the most recent
 * "mon_stat_smooth_intervals" entries so the aggregated delta is a
 * smoothed sum over that window.
 *
 * @param cct ceph context
 * @param ts Timestamp for the stats being delta'ed
 * @param old_pool_sum Previous stats sum
 * @param last_ts Last timestamp for pool
 * @param current_pool_sum Resulting stats
 * @param result_pool_delta Resulting pool delta
 * @param result_ts_delta Resulting timestamp delta
 * @param delta_avg_list List of last N computed deltas, used to average
 */
void PGMap::update_delta(
  CephContext *cct,
  const utime_t ts,
  const pool_stat_t& old_pool_sum,
  utime_t *last_ts,
  const pool_stat_t& current_pool_sum,
  pool_stat_t *result_pool_delta,
  utime_t *result_ts_delta,
  mempool::pgmap::list<pair<pool_stat_t,utime_t> > *delta_avg_list)
{
  // Wall time elapsed since the previous sample, then remember this
  // sample as the new reference point.
  utime_t elapsed = ts;
  elapsed -= *last_ts;
  *last_ts = ts;

  // Cap the elapsed time so one long quiet period cannot dominate the
  // smoothed rates ("quick start" after a long gap without updates).
  const utime_t cap(2 * (cct ? cct->_conf->mon_delta_reset_interval : 10), 0);
  elapsed = std::min(elapsed, cap);

  // Stat delta for this interval: current sum minus previous sum.
  pool_stat_t interval_delta = current_pool_sum;
  interval_delta.stats.sub(old_pool_sum.stats);

  // Skip the very first (unsynchronized) sample, where the previous sum
  // is still all-zero.
  if (!old_pool_sum.stats.sum.is_zero()) {
    delta_avg_list->push_back(make_pair(interval_delta, elapsed));
    *result_ts_delta += elapsed;                    // grow the time window
    result_pool_delta->stats.add(interval_delta.stats);
  }

  // Keep at most "mon_stat_smooth_intervals" (default 6) entries,
  // retiring the oldest from the aggregated sums as they fall out.
  const size_t window =
    cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
  while (delta_avg_list->size() > window) {
    result_pool_delta->stats.sub(delta_avg_list->front().first.stats);
    *result_ts_delta -= delta_avg_list->front().second;
    delta_avg_list->pop_front();
  }
}
iostat获取
// Dumps one pool's stats including its client io rates (excerpt; only
// the io-rate call is shown here).
void PGMap::dump_pool_stats_and_io_rate(int64_t poolid, const OSDMap &osd_map,
Formatter *f,
stringstream *rs)
{
......
// Render the smoothed client io rates for this pool.
pool_client_io_rate_summary(f, &rss, poolid);
......
}
// Look up the smoothed delta recorded for @p poolid and render its
// client io rates; a pool with no recorded delta produces no output.
void PGMapDigest::pool_client_io_rate_summary(Formatter *f, ostream *out,
                                              uint64_t poolid) const
{
  auto delta_it = per_pool_sum_delta.find(poolid);
  if (delta_it == per_pool_sum_delta.end())
    return;  // nothing recorded for this pool yet
  // The stamps map is maintained in lockstep with per_pool_sum_delta,
  // so the matching entry must exist.
  auto stamp_it = per_pool_sum_deltas_stamps.find(delta_it->first);
  ceph_assert(stamp_it != per_pool_sum_deltas_stamps.end());
  client_io_rate_summary(f, out, delta_it->second.first, stamp_it->second);
}
// Render smoothed client io rates — average bandwidth and iops — from
// the aggregated delta @p delta_sum over the time window @p delta_stamp,
// either into the formatter @p f (JSON keys) or as human-readable text
// into @p out.
void PGMapDigest::client_io_rate_summary(Formatter *f, ostream *out,
                                         const pool_stat_t& delta_sum,
                                         utime_t delta_stamp) const
{
  pool_stat_t positive = delta_sum;
  positive.floor(0);  // clamp negative deltas (e.g. after resets) to zero

  const auto& sum = positive.stats.sum;
  if (!sum.num_rd && !sum.num_wr)
    return;  // no client io in the window: emit nothing at all

  // Emit one average-bandwidth figure: a kb count scaled to bytes/sec.
  auto emit_bw = [&](int64_t kb, const char *key, const char *suffix) {
    int64_t bps = (kb << 10) / (double)delta_stamp;
    if (f) {
      f->dump_int(key, bps);
    } else {
      *out << byte_u_t(bps) << suffix;
    }
  };
  if (sum.num_rd)
    emit_bw(sum.num_rd_kb, "read_bytes_sec", "/s rd, ");
  if (sum.num_wr)
    emit_bw(sum.num_wr_kb, "write_bytes_sec", "/s wr, ");

  // Average read/write iops over the same window.
  int64_t iops_rd = sum.num_rd / (double)delta_stamp;
  int64_t iops_wr = sum.num_wr / (double)delta_stamp;
  if (f) {
    f->dump_int("read_op_per_sec", iops_rd);
    f->dump_int("write_op_per_sec", iops_wr);
  } else {
    *out << si_u_t(iops_rd) << " op/s rd, " << si_u_t(iops_wr) << " op/s wr";
  }
}