ceph集群整体iostat数据统计流程

osd收集stats数据

代码位置:OSD.cc::collect_pg_stats()

mgr接收report

// Message dispatch entry point (excerpt): routes incoming PG stats reports
// from OSDs into the pending PGMap increment.
bool DaemonServer::ms_dispatch(Message *m)
{
  switch (m->get_type()) {
    case MSG_PGSTATS:
      cluster_state.ingest_pgstats(static_cast<MPGStats*>(m));	// fill PGMap::Incremental *pending_inc
      maybe_ready(m->get_source().num());	// once all OSDs have reported, aggregate PGs into pools and send the report to the mon
      m->put();
      return true;
。。。
  };
}

等待所有osd报告完成后再发送报告

// Record an initial stats report from @p osd_id; once every up OSD has
// reported at least once, mark the PGMap ready and push a report to the mon.
void DaemonServer::maybe_ready(int32_t osd_id)
{
  // Fast path: we don't need to take the lock because pgmap_ready,
  // once set, stays set.
  if (pgmap_ready.load()) {
    return;
  }

  std::lock_guard l(lock);

  // Only the first report from a given OSD is interesting here.
  if (reported_osds.count(osd_id) != 0) {
    return;
  }

  dout(4) << "initial report from osd " << osd_id << dendl;
  reported_osds.insert(osd_id);

  // Snapshot the set of OSDs that are currently up.
  std::set<int32_t> up_set;
  cluster_state.with_osdmap([&](const OSDMap& osdmap) {
      osdmap.get_up_osds(up_set);
  });

  // Which up OSDs have we not heard from yet?
  std::set<int32_t> pending;
  std::set_difference(up_set.begin(), up_set.end(),
                      reported_osds.begin(), reported_osds.end(),
                      std::inserter(pending, pending.begin()));

  if (pending.empty()) {
    // Every up OSD has reported.
    dout(4) << "all osds have reported, sending PG state to mon" << dendl;
    pgmap_ready = true;
    reported_osds.clear();
    // Avoid waiting for next tick
    send_report();
  } else {
    dout(4) << "still waiting for " << pending.size() << " osds"
               " to report in before PGMap is ready" << dendl;
  }
}

发送报告

// Push the aggregated PG/pool stats up to the monitor (excerpt).
void DaemonServer::send_report()
{
  // If not every OSD has reported yet, normally hold off -- unless the mgr
  // has been up longer than 4x mgr_stats_period, in which case give up on
  // the stragglers and force-send the (possibly incomplete) state.
  if (!pgmap_ready) {
    if (ceph_clock_now() - started_at > g_conf().get_val<int64_t>("mgr_stats_period") * 4.0) {	// mgr uptime exceeded mgr_stats_period * 4.0 with OSDs still unreported: force-submit the current update
      pgmap_ready = true;
      reported_osds.clear();
      dout(1) << "Giving up on OSDs that haven't reported yet, sending "
              << "potentially incomplete PG state to mon" << dendl;
    } else {
      dout(1) << "Not sending PG status to monitor yet, waiting for OSDs"
              << dendl;
      return;
    }
  }
      ......
  cluster_state.with_mutable_pgmap([&](PGMap& pg_map) {
      cluster_state.update_delta_stats();	// aggregate pg/pool stats and compute the deltas
     ......
    });
	......
  monc->send_mon_message(m);	// report to the mon
}

获取时间戳

// Stamp and version the pending increment, apply it to the PGMap, then
// reset the increment for the next reporting cycle.
void ClusterState::update_delta_stats()
{
  pending_inc.stamp = ceph_clock_now();		// timestamp for this batch of stats
  pending_inc.version = pg_map.version + 1; // to make apply_incremental happy
  dout(10) << " v" << pending_inc.version << dendl;

  dout(30) << " pg_map before:\n";
  // NOTE(review): each dout(30)...dendl pair expands to its own macro scope,
  // which is presumably why two locals named 'jf' can coexist below --
  // confirm against the dout/dendl macro definitions before restructuring.
  JSONFormatter jf(true);
  jf.dump_object("pg_map", pg_map);
  jf.flush(*_dout);
  *_dout << dendl;
  dout(30) << " incremental:\n";
  JSONFormatter jf(true);
  jf.dump_object("pending_inc", pending_inc);
  jf.flush(*_dout);
  *_dout << dendl;
  pg_map.apply_incremental(g_ceph_context, pending_inc);	// fold the increment into the PGMap
  pending_inc = PGMap::Incremental();	// reset pending_inc for the next cycle
}

开始更新

/*
 * Fold one reporting cycle's Incremental into this PGMap: refresh per-PG,
 * per-(pool,osd) statfs and per-OSD stats, keep the per-pool and
 * cluster-wide sums in sync, and update the smoothed rate deltas.
 *
 * BUGFIX: the loop that removes a deleted OSD's pool_statfs entries used
 * `for (...; ++i)` together with `pool_statfs.erase(i)`, incrementing an
 * iterator that erase() had just invalidated -- undefined behavior.  It now
 * advances via erase()'s return value.
 */
void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
{
  ceph_assert(inc.version == version+1);
  version++;

  pool_stat_t pg_sum_old = pg_sum;	// cluster-wide pg stat sum before this update
  mempool::pgmap::unordered_map<int32_t, pool_stat_t> pg_pool_sum_old;
  pg_pool_sum_old = pg_pool_sum;	// per-pool stat sums before this update

  // Refresh individual PG stats, keeping the per-pool sums in step.
  for (auto p = inc.pg_stat_updates.begin();
       p != inc.pg_stat_updates.end();
       ++p) {
    const pg_t &update_pg(p->first);  // pgid
    auto update_pool = update_pg.pool();  // owning pool
    const pg_stat_t &update_stat(p->second);

    auto pg_stat_iter = pg_stat.find(update_pg);
    pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool];	// stat sum of this PG's pool
    if (pg_stat_iter == pg_stat.end()) {
      // First time we see this PG.
      pg_stat.insert(make_pair(update_pg, update_stat));
    } else {
      // Replacing existing stats: back the old values out of the aggregates.
      stat_pg_sub(update_pg, pg_stat_iter->second);	// remove old pg_stat from global aggregates
      pool_sum_ref.sub(pg_stat_iter->second);	// and from the pool sum
      pg_stat_iter->second = update_stat;	// store the new pg_stat
    }
    stat_pg_add(update_pg, update_stat);	// add the new pg_stat to global aggregates
    pool_sum_ref.add(update_stat);	// and to the pool sum
  }

  // Refresh per-(pool, osd) statfs entries.
  for (auto p = inc.pool_statfs_updates.begin();
       p != inc.pool_statfs_updates.end();
       ++p) {
    auto update_pool = p->first.first;  // pool id
    auto update_osd =  p->first.second;	// osd id
    auto& statfs_inc = p->second;

    auto pool_statfs_iter =
      pool_statfs.find(std::make_pair(update_pool, update_osd));
    if (pg_pool_sum.count(update_pool)) {  // only for pools we are tracking
      pool_stat_t &pool_sum_ref = pg_pool_sum[update_pool];
      if (pool_statfs_iter == pool_statfs.end()) { // first entry for this pair
        pool_statfs.emplace(std::make_pair(update_pool, update_osd), statfs_inc);
      } else {
        pool_sum_ref.sub(pool_statfs_iter->second);	// back out the old value
        pool_statfs_iter->second = statfs_inc;	// store the new one
      }
      pool_sum_ref.add(statfs_inc);	// add the new value to the pool sum
    }
  }

  // Refresh per-OSD stats.
  for (auto p = inc.get_osd_stat_updates().begin();
       p != inc.get_osd_stat_updates().end();
       ++p) {
    int osd = p->first;
    const osd_stat_t &new_stats(p->second);

    auto t = osd_stat.find(osd);
    if (t == osd_stat.end()) {
      osd_stat.insert(make_pair(osd, new_stats));
    } else {
      stat_osd_sub(t->first, t->second);
      t->second = new_stats;
    }
    stat_osd_add(osd, new_stats);
  }

  // Drop removed PGs; remember pools whose last PG disappeared.
  set<int64_t> deleted_pools;
  for (auto p = inc.pg_remove.begin();
       p != inc.pg_remove.end();
       ++p) {
    const pg_t &removed_pg(*p);
    auto s = pg_stat.find(removed_pg);
    bool pool_erased = false;
    if (s != pg_stat.end()) {
      pool_erased = stat_pg_sub(removed_pg, s->second);
      pg_stat.erase(s);
      if (pool_erased) {
        deleted_pools.insert(removed_pg.pool());
      }
    }
  }

  // Drop removed OSDs, together with their pool_statfs contributions.
  for (auto p = inc.get_osd_stat_rm().begin();
       p != inc.get_osd_stat_rm().end();
       ++p) {
    auto t = osd_stat.find(*p);
    if (t != osd_stat.end()) {
      stat_osd_sub(t->first, t->second);
      osd_stat.erase(t);
    }
    // erase() invalidates the erased iterator; advance via its return value
    // instead of ++i on a dead iterator (the previous code was UB and could
    // skip entries).
    for (auto i = pool_statfs.begin(); i != pool_statfs.end(); ) {
      if (i->first.second == *p) {
	pg_pool_sum[i->first.first].sub(i->second);
	i = pool_statfs.erase(i);
      } else {
	++i;
      }
    }
  }

  // skip calculating delta while sum was not synchronized
  if (!stamp.is_zero() && !pg_sum_old.stats.sum.is_zero()) {
    utime_t delta_t;	// elapsed time covered by this increment
    delta_t = inc.stamp;
    delta_t -= stamp;
    // calculate a delta, and average over the last 2 deltas.
    pool_stat_t d = pg_sum;		// current cluster-wide sum
    d.stats.sub(pg_sum_old.stats);
    pg_sum_deltas.push_back(make_pair(d, delta_t));
    stamp_delta += delta_t;
    pg_sum_delta.stats.add(d.stats);
    auto smooth_intervals =
      cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
    while (pg_sum_deltas.size() > smooth_intervals) {
      // Slide the smoothing window: retire the oldest delta.
      pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats);
      stamp_delta -= pg_sum_deltas.front().second;
      pg_sum_deltas.pop_front();
    }
  }
  stamp = inc.stamp;	// remember when the PGMap was last updated

  update_pool_deltas(cct, inc.stamp, pg_pool_sum_old);	// per-pool rate deltas

  for (auto p : deleted_pools) {
    if (cct)
      dout(20) << " deleted pool " << p << dendl;
    deleted_pool(p);
  }

  if (inc.osdmap_epoch)
    last_osdmap_epoch = inc.osdmap_epoch;
  if (inc.pg_scan)
    last_pg_scan = inc.pg_scan;
}

对pool计算增量

// Recompute the smoothed stat delta for every pool we had a previous sum for.
void PGMap::update_pool_deltas(
  CephContext *cct, const utime_t ts,
  const mempool::pgmap::unordered_map<int32_t,pool_stat_t>& pg_pool_sum_old)
{
  for (const auto& [pool_id, old_sum] : pg_pool_sum_old) {
    update_one_pool_delta(cct, ts, pool_id, old_sum);
  }
}
......

// Update the running delta aggregates for a single pool.
void PGMap::update_one_pool_delta(
  CephContext *cct,
  const utime_t ts,
  const int64_t pool,
  const pool_stat_t& old_pool_sum)
{
  // A pool must appear in all three per-pool maps in lock-step; anything
  // else means the bookkeeping has diverged.
  const bool first_seen = (per_pool_sum_deltas.count(pool) == 0);
  if (first_seen) {
    ceph_assert(per_pool_sum_deltas_stamps.count(pool) == 0);
    ceph_assert(per_pool_sum_delta.count(pool) == 0);
  }

  // operator[] default-constructs the entries the first time we see a pool.
  auto& sum_delta = per_pool_sum_delta[pool];     // (delta sum, last stamp)
  auto& stamps = per_pool_sum_deltas_stamps[pool];
  auto& avg_list = per_pool_sum_deltas[pool];

  update_delta(cct, ts, old_pool_sum, &sum_delta.second, pg_pool_sum[pool],
               &sum_delta.first, &stamps, &avg_list);
}

计算n个增量

/**
 * update aggregated delta
 *
 * @param cct               ceph context
 * @param ts                Timestamp for the stats being delta'ed
 * @param old_pool_sum      Previous stats sum
 * @param last_ts           Last timestamp for pool
 * @param result_pool_sum   Resulting stats
 * @param result_pool_delta Resulting pool delta
 * @param result_ts_delta   Resulting timestamp delta
 * @param delta_avg_list    List of last N computed deltas, used to average
 */
void PGMap::update_delta(
  CephContext *cct,
  const utime_t ts,
  const pool_stat_t& old_pool_sum,
  utime_t *last_ts,
  const pool_stat_t& current_pool_sum,
  pool_stat_t *result_pool_delta,
  utime_t *result_ts_delta,
  mempool::pgmap::list<pair<pool_stat_t,utime_t> > *delta_avg_list
  )
{
  // Elapsed wall-clock time since the previous sample; @p ts becomes the
  // new "last seen" timestamp.
  utime_t elapsed = ts;
  elapsed -= *last_ts;
  *last_ts = ts;

  // Cap the interval so a pool that saw no updates for a long time ramps
  // up quickly instead of averaging over a huge time span.
  const utime_t cap(2 * (cct ? cct->_conf->mon_delta_reset_interval : 10), 0);
  if (cap < elapsed)
    elapsed = cap;

  // Stats delta for this sample: current sum minus the previous sum.
  pool_stat_t sample = current_pool_sum;
  sample.stats.sub(old_pool_sum.stats);

  // Fold the sample into the running aggregate -- unless the previous sum
  // was all zero, i.e. we were not yet synchronized.
  if (!old_pool_sum.stats.sum.is_zero()) {
    delta_avg_list->push_back(make_pair(sample, elapsed));
    *result_ts_delta += elapsed;
    result_pool_delta->stats.add(sample.stats);
  }

  // Keep only the newest N samples (mon_stat_smooth_intervals, 6 by
  // default): retire the oldest sample from the aggregate.
  const size_t window =
    cct ? cct->_conf.get_val<uint64_t>("mon_stat_smooth_intervals") : 1;
  while (delta_avg_list->size() > window) {
    result_pool_delta->stats.sub(delta_avg_list->front().first.stats);
    *result_ts_delta -= delta_avg_list->front().second;
    delta_avg_list->pop_front();
  }
}

iostat获取

// Excerpt: dumping one pool's stats; the per-pool client IO rates come
// from pool_client_io_rate_summary().
void PGMap::dump_pool_stats_and_io_rate(int64_t poolid, const OSDMap &osd_map,
                                        Formatter *f,
                                        stringstream *rs)
 {
 	......
   pool_client_io_rate_summary(f, &rss, poolid);
    ......
}
// Emit the smoothed client IO rates for one pool, if we have delta data.
void PGMapDigest::pool_client_io_rate_summary(Formatter *f, ostream *out,
                                        uint64_t poolid) const
{
  auto delta_it = per_pool_sum_delta.find(poolid);
  if (delta_it == per_pool_sum_delta.end())
    return;

  // The stamps map is maintained in lock-step with per_pool_sum_delta.
  auto stamp_it = per_pool_sum_deltas_stamps.find(delta_it->first);
  ceph_assert(stamp_it != per_pool_sum_deltas_stamps.end());
  client_io_rate_summary(f, out, delta_it->second.first, stamp_it->second);
}
// Render average read/write bandwidth and IOPS from an aggregated delta
// and the time span it covers; structured output when @p f is set,
// human-readable text via @p out otherwise.
void PGMapDigest::client_io_rate_summary(Formatter *f, ostream *out,
                                   const pool_stat_t& delta_sum,
                                   utime_t delta_stamp) const
{
  // Clamp negative deltas to zero before deriving rates.
  pool_stat_t pos_delta = delta_sum;
  pos_delta.floor(0);

  const auto& sum = pos_delta.stats.sum;
  if (sum.num_rd == 0 && sum.num_wr == 0)
    return;

  const double interval = static_cast<double>(delta_stamp);

  if (sum.num_rd) {
    // num_rd_kb << 10 converts KiB to bytes; divide by the sampled span.
    int64_t rd = (sum.num_rd_kb << 10) / interval;
    if (f) {
      f->dump_int("read_bytes_sec", rd);
    } else {
      *out << byte_u_t(rd) << "/s rd, ";
    }
  }
  if (sum.num_wr) {
    int64_t wr = (sum.num_wr_kb << 10) / interval;
    if (f) {
      f->dump_int("write_bytes_sec", wr);
    } else {
      *out << byte_u_t(wr) << "/s wr, ";
    }
  }

  int64_t iops_rd = sum.num_rd / interval;  // average read IOPS
  int64_t iops_wr = sum.num_wr / interval;  // average write IOPS
  if (f) {
    f->dump_int("read_op_per_sec", iops_rd);
    f->dump_int("write_op_per_sec", iops_wr);
  } else {
    *out << si_u_t(iops_rd) << " op/s rd, " << si_u_t(iops_wr) << " op/s wr";
  }
}

完结

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值