Analysis of the librados::IoCtxImpl::write Write Path

This article walks through the implementation of several modules on the Ceph client side. The client mainly implements the interfaces that expose access to Ceph storage; upper layers go through these interfaces. Librados and Osdc sit at a fairly low level of the Ceph client: Librados provides the basic interfaces for creating and deleting pools and objects, while Osdc encapsulates operations, computes object addresses, sends requests, and handles timeouts. As shown in the figure:

[Figure: LIBRADOS architecture diagram]

Following the LIBRADOS architecture diagram, the rough flow of events is as follows. The book "Ceph分布式存储实战" (Ceph Distributed Storage in Practice) puts it this way:
First, a RADOS handle is created from the configuration file via LIBRADOS, and a RadosClient is created for that handle. RadosClient contains three main modules (Finisher, Messenger, Objecter). An ioctx is then created for the chosen pool, and through the ioctx the RadosClient can be reached. Finally, OSDC generates the corresponding OSD requests and communicates with the OSDs to service them. This describes, in broad strokes, the role librados and osdc play within Ceph.
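
This flow maps directly onto the public librados C++ API. A minimal sketch of it (the pool name "rbd" and object name "myobj" are placeholders; most error handling is omitted):

#include <rados/librados.hpp>

int main() {
  librados::Rados cluster;               // wraps a RadosClient
  cluster.init(nullptr);                 // default client id
  cluster.conf_read_file(nullptr);       // read the default ceph.conf
  if (cluster.connect() < 0)             // MonClient/Messenger/Objecter come up here
    return 1;

  librados::IoCtx ioctx;                 // backed by one IoCtxImpl per pool
  if (cluster.ioctx_create("rbd", ioctx) < 0)
    return 1;

  librados::bufferlist bl;
  bl.append("hello ceph");
  int r = ioctx.write("myobj", bl, bl.length(), 0);  // enters IoCtxImpl::write
  cluster.shutdown();
  return r < 0 ? 1 : 0;
}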

Librados

This module has two parts: RadosClient and IoCtxImpl. RadosClient sits at the top layer and is the core management class of librados, covering management of the RADOS system as a whole as well as of individual pools. IoCtxImpl, by contrast, manages one specific pool, e.g. controlling object reads and writes. The main classes discussed below are:
RadosClient (Librados module)
IoCtxImpl (Librados module)
Objecter (Osdc module)

The RadosClient class

class librados::RadosClient : public Dispatcher,
			      public md_config_obs_t
{
  friend neorados::detail::RadosClient;
public:
  using Dispatcher::cct;
private:
  std::unique_ptr<CephContext,
		  std::function<void(CephContext*)>> cct_deleter;

public:
  const ConfigProxy& conf{cct->_conf};
  ceph::async::io_context_pool poolctx;
private:
  enum {
    DISCONNECTED,
    CONNECTING,
    CONNECTED,
  } state{DISCONNECTED};

  MonClient monclient{cct, poolctx};
  MgrClient mgrclient{cct, nullptr, &monclient.monmap};
  Messenger *messenger{nullptr}; // network messenger interface

  uint64_t instance_id{0};

  bool _dispatch(Message *m); // message dispatch (overrides of the Dispatcher interface below)
  bool ms_dispatch(Message *m) override;

  void ms_handle_connect(Connection *con) override;
  bool ms_handle_reset(Connection *con) override;
  void ms_handle_remote_reset(Connection *con) override;
  bool ms_handle_refused(Connection *con) override;

  Objecter *objecter{nullptr}; // from the Osdc module; sends the encapsulated OP messages

  ceph::mutex lock = ceph::make_mutex("librados::RadosClient::lock");
  ceph::condition_variable cond;
  int refcnt{1};

  version_t log_last_version{0};
  rados_log_callback_t log_cb{nullptr};
  rados_log_callback2_t log_cb2{nullptr};
  void *log_cb_arg{nullptr};
  std::string log_watch;

  bool service_daemon = false;
  std::string daemon_name, service_name;
  std::map<std::string,std::string> daemon_metadata;
  ceph::timespan rados_mon_op_timeout{};

  int wait_for_osdmap();

public:
  boost::asio::io_context::strand finish_strand{poolctx.get_io_context()};

  explicit RadosClient(CephContext *cct);
  ~RadosClient() override;
  int ping_monitor(std::string mon_id, std::string *result);
  int connect();
  void shutdown();

  int watch_flush();
  int async_watch_flush(AioCompletionImpl *c);

  uint64_t get_instance_id();

  int get_min_compatible_osd(int8_t* require_osd_release);
  int get_min_compatible_client(int8_t* min_compat_client,
                                int8_t* require_min_compat_client);

  int wait_for_latest_osdmap();

  int create_ioctx(const char *name, IoCtxImpl **io);
  int create_ioctx(int64_t, IoCtxImpl **io);

  int get_fsid(std::string *s);
  int64_t lookup_pool(const char *name);
  bool pool_requires_alignment(int64_t pool_id);
  int pool_requires_alignment2(int64_t pool_id, bool *requires);
  uint64_t pool_required_alignment(int64_t pool_id);
  int pool_required_alignment2(int64_t pool_id, uint64_t *alignment);
  int pool_get_name(uint64_t pool_id, std::string *name,
		    bool wait_latest_map = false);

  int pool_list(std::list<std::pair<int64_t, std::string> >& ls);
  int get_pool_stats(std::list<std::string>& ls, std::map<std::string,::pool_stat_t> *result,
    bool *per_pool);
  int get_fs_stats(ceph_statfs& result);
  bool get_pool_is_selfmanaged_snaps_mode(const std::string& pool);

  /*
  -1 was set as the default value and monitor will pickup the right crush rule with below order:
    a) osd pool default crush replicated rule
    b) the first rule
    c) error out if no value find
  */
  int pool_create(std::string& name, int16_t crush_rule=-1);
  int pool_create_async(std::string& name, PoolAsyncCompletionImpl *c,
			int16_t crush_rule=-1);
  int pool_get_base_tier(int64_t pool_id, int64_t* base_tier);
  int pool_delete(const char *name);

  int pool_delete_async(const char *name, PoolAsyncCompletionImpl *c);

  int blocklist_add(const std::string& client_address, uint32_t expire_seconds);

  int mon_command(const std::vector<std::string>& cmd, const bufferlist &inbl,
	          bufferlist *outbl, std::string *outs);
  void mon_command_async(const std::vector<std::string>& cmd, const bufferlist &inbl,
                         bufferlist *outbl, std::string *outs, Context *on_finish);
  int mon_command(int rank,
		  const std::vector<std::string>& cmd, const bufferlist &inbl,
	          bufferlist *outbl, std::string *outs);
  int mon_command(std::string name,
		  const std::vector<std::string>& cmd, const bufferlist &inbl,
	          bufferlist *outbl, std::string *outs);
  int mgr_command(const std::vector<std::string>& cmd, const bufferlist &inbl,
	          bufferlist *outbl, std::string *outs);
  int mgr_command(
    const std::string& name,
    const std::vector<std::string>& cmd, const bufferlist &inbl,
    bufferlist *outbl, std::string *outs);
  int osd_command(int osd, std::vector<std::string>& cmd, const bufferlist& inbl,
                  bufferlist *poutbl, std::string *prs);
  int pg_command(pg_t pgid, std::vector<std::string>& cmd, const bufferlist& inbl,
	         bufferlist *poutbl, std::string *prs);

  void handle_log(MLog *m);
  int monitor_log(const std::string& level, rados_log_callback_t cb,
		  rados_log_callback2_t cb2, void *arg);

  void get();
  bool put();
  void blocklist_self(bool set);

  std::string get_addrs() const;

  int service_daemon_register(
    const std::string& service,  ///< service name (e.g., 'rgw')
    const std::string& name,     ///< daemon name (e.g., 'gwfoo')
    const std::map<std::string,std::string>& metadata); ///< static metadata about daemon
  int service_daemon_update_status(
    std::map<std::string,std::string>&& status);

  mon_feature_t get_required_monitor_features() const;

  int get_inconsistent_pgs(int64_t pool_id, std::vector<std::string>* pgs);
  const char** get_tracked_conf_keys() const override;
  void handle_conf_change(const ConfigProxy& conf,
                          const std::set <std::string> &changed) override;
};

The IoCtxImpl class

This class is the per-pool context: one pool corresponds to one IoCtxImpl object. All of the I/O APIs in librados are exposed on librados::IoCtx, while the real implementation lives in IoCtxImpl. A request is processed as follows (see the sketch after the list):
1) The request is encapsulated into an ObjectOperation (from osdc).
2) Pool information is added and the whole thing is wrapped into an Objecter::Op object.
3) objecter->op_submit is called to send it to the corresponding OSD.
4) When the operation completes, the registered callback is invoked.
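
For comparison, the same pattern is visible through the public operation-based API. A minimal sketch (assumes ioctx is an open librados::IoCtx; "myobj" is a placeholder):

#include <rados/librados.hpp>

// Issue a write through the operation API; internally this follows the
// ObjectOperation -> Objecter::Op -> op_submit path described above.
int write_via_operation(librados::IoCtx& ioctx) {
  librados::bufferlist bl;
  bl.append("payload");

  librados::ObjectWriteOperation op;  // wraps an osdc ::ObjectOperation
  op.write(0, bl);                    // step 1: encapsulate the request

  // Steps 2-4: operate() adds the pool context, submits the wrapped
  // Objecter::Op, and blocks until the commit callback fires.
  return ioctx.operate("myobj", &op);
}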

OSDC

This module sits at a fairly low level of the client. It encapsulates operation data, computes object addresses, sends requests, and handles timeouts.

Analysis of the client read/write path

librados::IoCtxImpl::write
	librados::IoCtxImpl::operate
		Objecter::prepare_mutate_op
			Objecter::op_submit
				Objecter::_op_submit_with_budget
					Objecter::_op_submit
						Objecter::_calc_target
						Objecter::_get_session
						Objecter::_send_op

Notes:
prepare_mutate_op:
objecter->prepare_mutate_op wraps the ObjectOperation into an Op.

op_submit:
This function sends the encapsulated Op out over the network. op_submit calls _op_submit_with_budget, which handles the Throttle-related flow control and timeout handling, and that in turn calls _op_submit to do the actual addressing and sending.

_calc_target:
Called to compute the target OSD.

_get_session:
Called to get the connection to the target OSD; if it returns -EAGAIN, the caller upgrades to the write lock and retries.

write

Encapsulating the write operation

int librados::IoCtxImpl::write(const object_t& oid, bufferlist& bl,
			       size_t len, uint64_t off)
{
  if (len > UINT_MAX/2)
    return -E2BIG;
  ::ObjectOperation op;
  prepare_assert_ops(&op);
  bufferlist mybl;
  mybl.substr_of(bl, 0, len);
  op.write(off, mybl);
  return operate(oid, &op, NULL); // hand off to operate()
}

operate

int librados::IoCtxImpl::operate(const object_t& oid, ::ObjectOperation *o,
				 ceph::real_time *pmtime, int flags)
{
  ceph::real_time ut = (pmtime ? *pmtime :
    ceph::real_clock::now());

  /* can't write to a snapshot */
  if (snap_seq != CEPH_NOSNAP)
    return -EROFS;

  if (!o->size())
    return 0;

  ceph::mutex mylock = ceph::make_mutex("IoCtxImpl::operate::mylock");
  ceph::condition_variable cond;
  bool done;
  int r;
  version_t ver;

  Context *oncommit = new C_SafeCond(mylock, cond, &done, &r);

  int op = o->ops[0].op.op;
  ldout(client->cct, 10) << ceph_osd_op_name(op) << " oid=" << oid
			 << " nspace=" << oloc.nspace << dendl;
  Objecter::Op *objecter_op = objecter->prepare_mutate_op(
    oid, oloc,
    *o, snapc, ut,
    flags | extra_op_flags,
    oncommit, &ver);
  objecter->op_submit(objecter_op);

  {
    std::unique_lock l{mylock};
    cond.wait(l, [&done] { return done;});
  }
  ldout(client->cct, 10) << "Objecter returned from "
	<< ceph_osd_op_name(op) << " r=" << r << dendl;

  set_sync_op_version(ver);

  return r;
}
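
operate() turns the asynchronous submission into a synchronous call: the C_SafeCond completion stores the result, sets done, and signals the condition variable the submitting thread is blocked on. A standalone sketch of this pattern (illustrative names, not the actual C_SafeCond implementation):

#include <condition_variable>
#include <functional>
#include <mutex>

// Illustrative stand-in for Context/C_SafeCond: the completion records the
// result, flips the flag, and wakes the waiting submitter.
struct SafeCond {
  std::mutex lock;
  std::condition_variable cond;
  bool done = false;
  int rval = 0;

  void finish(int r) {              // called from the response handler
    std::lock_guard l{lock};
    rval = r;
    done = true;
    cond.notify_all();
  }

  int wait() {                      // what operate() does after op_submit
    std::unique_lock l{lock};
    cond.wait(l, [this] { return done; });
    return rval;
  }
};

int submit_and_wait(const std::function<void(SafeCond*)>& async_submit) {
  SafeCond c;
  async_submit(&c);                 // e.g. hand &c to the op as oncommit
  return c.wait();
}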

prepare_mutate_op

objecter->prepare_mutate_op wraps the ObjectOperation into an Op tagged with CEPH_OSD_FLAG_WRITE; note that it moves the per-op output fields into the new Op and then clears the source ObjectOperation.

 Op *prepare_mutate_op(
    const object_t& oid, const object_locator_t& oloc,
    ObjectOperation& op, const SnapContext& snapc,
    ceph::real_time mtime, int flags,
    Context *oncommit, version_t *objver = NULL,
    osd_reqid_t reqid = osd_reqid_t(),
    ZTracer::Trace *parent_trace = nullptr) {
    Op *o = new Op(oid, oloc, std::move(op.ops), flags | global_op_flags |
		   CEPH_OSD_FLAG_WRITE, oncommit, objver,
		   nullptr, parent_trace);
    o->priority = op.priority;
    o->mtime = mtime;
    o->snapc = snapc;
    o->out_rval.swap(op.out_rval);
    o->out_bl.swap(op.out_bl);
    o->out_handler.swap(op.out_handler);
    o->out_ec.swap(op.out_ec);
    o->reqid = reqid;
    op.clear();
    return o;
  }

op_submit

This function sends the encapsulated Op out over the network. op_submit calls _op_submit_with_budget to handle the Throttle-related flow control and timeouts; that function finally calls _op_submit, which performs the actual addressing and sending.

void Objecter::op_submit(Op *op, ceph_tid_t *ptid, int *ctx_budget)
{
  shunique_lock rl(rwlock, ceph::acquire_shared);
  ceph_tid_t tid = 0;
  if (!ptid)
    ptid = &tid;
  op->trace.event("op submit");
  _op_submit_with_budget(op, rl, ptid, ctx_budget);
}

void Objecter::_op_submit_with_budget(Op *op,
				      shunique_lock<ceph::shared_mutex>& sul,
				      ceph_tid_t *ptid,
				      int *ctx_budget)
{
  ceph_assert(initialized);

  ceph_assert(op->ops.size() == op->out_bl.size());
  ceph_assert(op->ops.size() == op->out_rval.size());
  ceph_assert(op->ops.size() == op->out_handler.size());

  // throttle.  before we look at any state, because
  // _take_op_budget() may drop our lock while it blocks.
  if (!op->ctx_budgeted || (ctx_budget && (*ctx_budget == -1))) {
    int op_budget = _take_op_budget(op, sul);
    // take and pass out the budget for the first OP
    // in the context session
    if (ctx_budget && (*ctx_budget == -1)) {
      *ctx_budget = op_budget;
    }
  }

  if (osd_timeout > timespan(0)) {
    if (op->tid == 0)
      op->tid = ++last_tid;
    auto tid = op->tid;
    op->ontimeout = timer.add_event(osd_timeout,
				    [this, tid]() {
				      op_cancel(tid, -ETIMEDOUT); });
  }

  _op_submit(op, sul, ptid);
}
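
Here _take_op_budget charges the op's cost against a client-side Throttle so the amount of in-flight I/O is bounded; when the budget is exhausted the submitter blocks until completions return budget, which is why it may drop the lock. A minimal sketch of the idea (a hypothetical SimpleThrottle, not Ceph's Throttle class):

#include <condition_variable>
#include <cstdint>
#include <mutex>

// Hypothetical counting throttle: take() blocks while the in-flight total
// would exceed the maximum; put() is called as ops complete.
class SimpleThrottle {
  std::mutex lock;
  std::condition_variable cond;
  const uint64_t max;
  uint64_t current = 0;

public:
  explicit SimpleThrottle(uint64_t max_bytes) : max(max_bytes) {}

  void take(uint64_t cost) {        // analogous to _take_op_budget()
    std::unique_lock l{lock};
    cond.wait(l, [&] { return current + cost <= max; });
    current += cost;
  }

  void put(uint64_t cost) {         // analogous to put_op_budget() on completion
    std::lock_guard l{lock};
    current -= cost;
    cond.notify_all();
  }
};

_op_submit then performs the actual addressing and sending: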

void Objecter::_op_submit(Op *op, shunique_lock<ceph::shared_mutex>& sul, ceph_tid_t *ptid)
{
  // rwlock is locked

  ldout(cct, 10) << __func__ << " op " << op << dendl;

  // pick target
  ceph_assert(op->session == NULL);
  OSDSession *s = NULL;

  bool check_for_latest_map = false;
  int r = _calc_target(&op->target, nullptr);
  switch(r) {
  case RECALC_OP_TARGET_POOL_DNE:
    check_for_latest_map = true;
    break;
  case RECALC_OP_TARGET_POOL_EIO:
    if (op->has_completion()) {
      op->complete(osdc_errc::pool_eio, -EIO);
    }
    return;
  }

  // Try to get a session, including a retry if we need to take write lock
  r = _get_session(op->target.osd, &s, sul);
  if (r == -EAGAIN ||
      (check_for_latest_map && sul.owns_lock_shared()) ||
      cct->_conf->objecter_debug_inject_relock_delay) {
    epoch_t orig_epoch = osdmap->get_epoch();
    sul.unlock();
    if (cct->_conf->objecter_debug_inject_relock_delay) {
      sleep(1);
    }
    sul.lock();
    if (orig_epoch != osdmap->get_epoch()) {
      // map changed; recalculate mapping
      ldout(cct, 10) << __func__ << " relock raced with osdmap, recalc target"
		     << dendl;
      check_for_latest_map = _calc_target(&op->target, nullptr)
	== RECALC_OP_TARGET_POOL_DNE;
      if (s) {
	put_session(s);
	s = NULL;
	r = -EAGAIN;
      }
    }
  }
  if (r == -EAGAIN) {
    ceph_assert(s == NULL);
    r = _get_session(op->target.osd, &s, sul);
  }
  ceph_assert(r == 0);
  ceph_assert(s);  // may be homeless

  _send_op_account(op);

  // send?

  ceph_assert(op->target.flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE));

  bool need_send = false;
  if (op->target.paused) {
    ldout(cct, 10) << " tid " << op->tid << " op " << op << " is paused"
		   << dendl;
    _maybe_request_map();
  } else if (!s->is_homeless()) {
    need_send = true;
  } else {
    _maybe_request_map();
  }

  unique_lock sl(s->lock);
  if (op->tid == 0)
    op->tid = ++last_tid;

  ldout(cct, 10) << "_op_submit oid " << op->target.base_oid
		 << " '" << op->target.base_oloc << "' '"
		 << op->target.target_oloc << "' " << op->ops << " tid "
		 << op->tid << " osd." << (!s->is_homeless() ? s->osd : -1)
		 << dendl;

  _session_op_assign(s, op);

  if (need_send) {
    _send_op(op);
  }

  // Last chance to touch Op here, after giving up session lock it can
  // be freed at any time by response handler.
  ceph_tid_t tid = op->tid;
  if (check_for_latest_map) {
    _send_op_map_check(op);
  }
  if (ptid)
    *ptid = tid;
  op = NULL;

  sl.unlock();
  put_session(s);

  ldout(cct, 5) << num_in_flight << " in flight" << dendl;
}
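
Note the relock dance in the middle of _op_submit: when _get_session needs the write lock, the shared lock is dropped and re-taken, and because the osdmap may change in that window, orig_epoch is compared against the current epoch and the target recalculated if they differ. A generic sketch of this pattern (illustrative types, not the actual Objecter code):

#include <shared_mutex>

struct Ctx {
  std::shared_mutex rwlock;  // stands in for Objecter::rwlock
  unsigned map_epoch = 0;    // stands in for osdmap->get_epoch()
};

// Remember the epoch, drop the shared lock, re-take the lock, and
// recompute the target if the map advanced in the unlocked window.
template <typename Recalc>
void relock_and_maybe_recalc(Ctx& c, Recalc recalc_target) {
  c.rwlock.lock_shared();
  unsigned orig_epoch = c.map_epoch;
  c.rwlock.unlock_shared();  // the "upgrade" is not atomic...
  c.rwlock.lock();           // ...so a map update may slip in here
  if (c.map_epoch != orig_epoch)
    recalc_target();         // e.g. _calc_target(&op->target, nullptr)
  c.rwlock.unlock();
}

_send_op

If the target is known and the session is not homeless, _op_submit ends by calling _send_op. Before transmitting, _send_op checks whether the target PG has an active backoff from the OSD (in which case the op stays queued), builds the MOSDOp via _prepare_osd_op, re-encodes it if the actual pgid changed since preparation, and finally hands it to the session's connection: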

void Objecter::_send_op(Op *op)
{
  // rwlock is locked
  // op->session->lock is locked

  // backoff?
  auto p = op->session->backoffs.find(op->target.actual_pgid);
  if (p != op->session->backoffs.end()) {
    hobject_t hoid = op->target.get_hobj();
    auto q = p->second.lower_bound(hoid);
    if (q != p->second.begin()) {
      --q;
      if (hoid >= q->second.end) {
	++q;
      }
    }
    if (q != p->second.end()) {
      ldout(cct, 20) << __func__ << " ? " << q->first << " [" << q->second.begin
		     << "," << q->second.end << ")" << dendl;
      int r = cmp(hoid, q->second.begin);
      if (r == 0 || (r > 0 && hoid < q->second.end)) {
	ldout(cct, 10) << __func__ << " backoff " << op->target.actual_pgid
		       << " id " << q->second.id << " on " << hoid
		       << ", queuing " << op << " tid " << op->tid << dendl;
	return;
      }
    }
  }

  ceph_assert(op->tid > 0);
  MOSDOp *m = _prepare_osd_op(op);

  if (op->target.actual_pgid != m->get_spg()) {
    ldout(cct, 10) << __func__ << " " << op->tid << " pgid change from "
		   << m->get_spg() << " to " << op->target.actual_pgid
		   << ", updating and reencoding" << dendl;
    m->set_spg(op->target.actual_pgid);
    m->clear_payload();  // reencode
  }

  ldout(cct, 15) << "_send_op " << op->tid << " to "
		 << op->target.actual_pgid << " on osd." << op->session->osd
		 << dendl;

  ConnectionRef con = op->session->con;
  ceph_assert(con);

#if 0
  // preallocated rx ceph::buffer?
  if (op->con) {
    ldout(cct, 20) << " revoking rx ceph::buffer for " << op->tid << " on "
		   << op->con << dendl;
    op->con->revoke_rx_buffer(op->tid);
  }
  if (op->outbl &&
      op->ontimeout == 0 &&  // only post rx_buffer if no timeout; see #9582
      op->outbl->length()) {
    op->outbl->invalidate_crc();  // messenger writes through c_str()
    ldout(cct, 20) << " posting rx ceph::buffer for " << op->tid << " on " << con
		   << dendl;
    op->con = con;
    op->con->post_rx_buffer(op->tid, *op->outbl);
  }
#endif

  op->incarnation = op->session->incarnation;

  if (op->trace.valid()) {
    m->trace.init("op msg", nullptr, &op->trace);
  }
  op->session->con->send_message(m); // finally send the message over the wire
}
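
The backoff check at the top of _send_op is an interval lookup: per-PG backoffs are kept in a map ordered by range start, and lower_bound plus one step back finds the only candidate [begin, end) range that could contain the object. A self-contained sketch of that lookup (int keys instead of hobject_t):

#include <iostream>
#include <map>

struct Range { int begin; int end; };   // half-open [begin, end)

// Same shape as the backoff scan in _send_op: locate the candidate
// interval around `key`, then test containment.
bool in_backoff(const std::map<int, Range>& backoffs, int key) {
  auto q = backoffs.lower_bound(key);   // first range whose begin >= key
  if (q != backoffs.begin()) {
    --q;                                // the previous range may cover key
    if (key >= q->second.end)
      ++q;                              // it does not; step forward again
  }
  return q != backoffs.end() &&
         key >= q->second.begin && key < q->second.end;
}

int main() {
  std::map<int, Range> b{{10, {10, 20}}, {40, {40, 45}}};
  std::cout << in_backoff(b, 15)        // 1: inside [10, 20)
            << in_backoff(b, 25)        // 0: between ranges
            << in_backoff(b, 40)        // 1: at the start of [40, 45)
            << '\n';
}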

_calc_target

osdmap->object_locator_to_pg maps the target object to its PG.
osdmap->pg_to_up_acting_osds runs the CRUSH algorithm to obtain the up and acting OSD sets for that PG.

int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change)
{
  // rwlock is locked
  bool is_read = t->flags & CEPH_OSD_FLAG_READ;
  bool is_write = t->flags & CEPH_OSD_FLAG_WRITE;
  t->epoch = osdmap->get_epoch();
  ldout(cct,20) << __func__ << " epoch " << t->epoch
		<< " base " << t->base_oid << " " << t->base_oloc
		<< " precalc_pgid " << (int)t->precalc_pgid
		<< " pgid " << t->base_pgid
		<< (is_read ? " is_read" : "")
		<< (is_write ? " is_write" : "")
		<< dendl;

  const pg_pool_t *pi = osdmap->get_pg_pool(t->base_oloc.pool);
  if (!pi) {
    t->osd = -1;
    return RECALC_OP_TARGET_POOL_DNE;
  }

  if (pi->has_flag(pg_pool_t::FLAG_EIO)) {
    return RECALC_OP_TARGET_POOL_EIO;
  }

  ldout(cct,30) << __func__ << "  base pi " << pi
		<< " pg_num " << pi->get_pg_num() << dendl;

  bool force_resend = false;
  if (osdmap->get_epoch() == pi->last_force_op_resend) {
    if (t->last_force_resend < pi->last_force_op_resend) {
      t->last_force_resend = pi->last_force_op_resend;
      force_resend = true;
    } else if (t->last_force_resend == 0) {
      force_resend = true;
    }
  }

  // apply tiering
  t->target_oid = t->base_oid;
  t->target_oloc = t->base_oloc;
  if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
    if (is_read && pi->has_read_tier())
      t->target_oloc.pool = pi->read_tier;
    if (is_write && pi->has_write_tier())
      t->target_oloc.pool = pi->write_tier;
    pi = osdmap->get_pg_pool(t->target_oloc.pool);
    if (!pi) {
      t->osd = -1;
      return RECALC_OP_TARGET_POOL_DNE;
    }
  }

  pg_t pgid;
  if (t->precalc_pgid) {
    ceph_assert(t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY);
    ceph_assert(t->base_oid.name.empty()); // make sure this is a pg op
    ceph_assert(t->base_oloc.pool == (int64_t)t->base_pgid.pool());
    pgid = t->base_pgid;
  } else {
    int ret = osdmap->object_locator_to_pg(t->target_oid, t->target_oloc,
					   pgid);
    if (ret == -ENOENT) {
      t->osd = -1;
      return RECALC_OP_TARGET_POOL_DNE;
    }
  }
  ldout(cct,20) << __func__ << " target " << t->target_oid << " "
		<< t->target_oloc << " -> pgid " << pgid << dendl;
  ldout(cct,30) << __func__ << "  target pi " << pi
		<< " pg_num " << pi->get_pg_num() << dendl;
  t->pool_ever_existed = true;

  int size = pi->size;
  int min_size = pi->min_size;
  unsigned pg_num = pi->get_pg_num();
  unsigned pg_num_mask = pi->get_pg_num_mask();
  unsigned pg_num_pending = pi->get_pg_num_pending();
  int up_primary, acting_primary;
  vector<int> up, acting;
  ps_t actual_ps = ceph_stable_mod(pgid.ps(), pg_num, pg_num_mask);
  pg_t actual_pgid(actual_ps, pgid.pool());
  if (!lookup_pg_mapping(actual_pgid, osdmap->get_epoch(), &up, &up_primary,
                         &acting, &acting_primary)) {
    osdmap->pg_to_up_acting_osds(actual_pgid, &up, &up_primary,
                                 &acting, &acting_primary);
    pg_mapping_t pg_mapping(osdmap->get_epoch(),
                            up, up_primary, acting, acting_primary);
    update_pg_mapping(actual_pgid, std::move(pg_mapping));
  }
  bool sort_bitwise = osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE);
  bool recovery_deletes = osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES);
  unsigned prev_seed = ceph_stable_mod(pgid.ps(), t->pg_num, t->pg_num_mask);
  pg_t prev_pgid(prev_seed, pgid.pool());
  if (any_change && PastIntervals::is_new_interval(
	t->acting_primary,
	acting_primary,
	t->acting,
	acting,
	t->up_primary,
	up_primary,
	t->up,
	up,
	t->size,
	size,
	t->min_size,
	min_size,
	t->pg_num,
	pg_num,
	t->pg_num_pending,
	pg_num_pending,
	t->sort_bitwise,
	sort_bitwise,
	t->recovery_deletes,
	recovery_deletes,
	t->peering_crush_bucket_count,
	pi->peering_crush_bucket_count,
	t->peering_crush_bucket_target,
	pi->peering_crush_bucket_target,
	t->peering_crush_bucket_barrier,
	pi->peering_crush_bucket_barrier,
	t->peering_crush_mandatory_member,
	pi->peering_crush_mandatory_member,
	prev_pgid)) {
    force_resend = true;
  }

  bool unpaused = false;
  bool should_be_paused = target_should_be_paused(t);
  if (t->paused && !should_be_paused) {
    unpaused = true;
  }
  if (t->paused != should_be_paused) {
    ldout(cct, 10) << __func__ << " paused " << t->paused
		   << " -> " << should_be_paused << dendl;
    t->paused = should_be_paused;
  }

  bool legacy_change =
    t->pgid != pgid ||
      is_pg_changed(
	t->acting_primary, t->acting, acting_primary, acting,
	t->used_replica || any_change);
  bool split_or_merge = false;
  if (t->pg_num) {
    split_or_merge =
      prev_pgid.is_split(t->pg_num, pg_num, nullptr) ||
      prev_pgid.is_merge_source(t->pg_num, pg_num, nullptr) ||
      prev_pgid.is_merge_target(t->pg_num, pg_num);
  }

  if (legacy_change || split_or_merge || force_resend) {
    t->pgid = pgid;
    t->acting = std::move(acting);
    t->acting_primary = acting_primary;
    t->up_primary = up_primary;
    t->up = std::move(up);
    t->size = size;
    t->min_size = min_size;
    t->pg_num = pg_num;
    t->pg_num_mask = pg_num_mask;
    t->pg_num_pending = pg_num_pending;
    spg_t spgid(actual_pgid);
    if (pi->is_erasure()) {
      for (uint8_t i = 0; i < t->acting.size(); ++i) {
        if (t->acting[i] == acting_primary) {
          spgid.reset_shard(shard_id_t(i));
          break;
        }
      }
    }
    t->actual_pgid = spgid;
    t->sort_bitwise = sort_bitwise;
    t->recovery_deletes = recovery_deletes;
    t->peering_crush_bucket_count = pi->peering_crush_bucket_count;
    t->peering_crush_bucket_target = pi->peering_crush_bucket_target;
    t->peering_crush_bucket_barrier = pi->peering_crush_bucket_barrier;
    t->peering_crush_mandatory_member = pi->peering_crush_mandatory_member;
    ldout(cct, 10) << __func__ << " "
		   << " raw pgid " << pgid << " -> actual " << t->actual_pgid
		   << " acting " << t->acting
		   << " primary " << acting_primary << dendl;
    t->used_replica = false;
    if ((t->flags & (CEPH_OSD_FLAG_BALANCE_READS |
                     CEPH_OSD_FLAG_LOCALIZE_READS)) &&
        !is_write && pi->is_replicated() && t->acting.size() > 1) {
      int osd;
      ceph_assert(is_read && t->acting[0] == acting_primary);
      if (t->flags & CEPH_OSD_FLAG_BALANCE_READS) {
	int p = rand() % t->acting.size();
	if (p)
	  t->used_replica = true;
	osd = t->acting[p];
	ldout(cct, 10) << " chose random osd." << osd << " of " << t->acting
		       << dendl;
      } else {
	// look for a local replica.  prefer the primary if the
	// distance is the same.
	int best = -1;
	int best_locality = 0;
	for (unsigned i = 0; i < t->acting.size(); ++i) {
	  int locality = osdmap->crush->get_common_ancestor_distance(
		 cct, t->acting[i], crush_location);
	  ldout(cct, 20) << __func__ << " localize: rank " << i
			 << " osd." << t->acting[i]
			 << " locality " << locality << dendl;
	  if (i == 0 ||
	      (locality >= 0 && best_locality >= 0 &&
	       locality < best_locality) ||
	      (best_locality < 0 && locality >= 0)) {
	    best = i;
	    best_locality = locality;
	    if (i)
	      t->used_replica = true;
	  }
	}
	ceph_assert(best >= 0);
	osd = t->acting[best];
      }
      t->osd = osd;
    } else {
      t->osd = acting_primary;
    }
  }
  if (legacy_change || unpaused || force_resend) {
    return RECALC_OP_TARGET_NEED_RESEND;
  }
  if (split_or_merge &&
      (osdmap->require_osd_release >= ceph_release_t::luminous ||
       HAVE_FEATURE(osdmap->get_xinfo(acting_primary).features,
		    RESEND_ON_SPLIT))) {
    return RECALC_OP_TARGET_NEED_RESEND;
  }
  return RECALC_OP_TARGET_NO_ACTION;
}
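
The placement seed is folded into the current pg_num with ceph_stable_mod (actual_ps = ceph_stable_mod(pgid.ps(), pg_num, pg_num_mask) above). It behaves like a modulo but keeps most existing seed-to-PG mappings stable while pg_num grows between powers of two. Its definition from Ceph's headers, plus a small demonstration:

#include <cstdio>

// ceph_stable_mod: b is pg_num, bmask the next power-of-two-minus-one
// >= b. Seeds whose low bits fall below b map to themselves mod 2^k;
// the rest fall back to mod 2^(k-1), so raising pg_num only remaps the
// seeds that actually have to move.
static inline int ceph_stable_mod(int x, int b, int bmask) {
  if ((x & bmask) < b)
    return x & bmask;
  else
    return x & (bmask >> 1);
}

int main() {
  // pg_num = 12, pg_num_mask = 15: seeds 0..11 stay put, 12..15 fold to 4..7.
  for (int seed = 0; seed < 16; ++seed)
    printf("seed %2d -> pg %d\n", seed, ceph_stable_mod(seed, 12, 15));
}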

_get_session


/**
 * Look up OSDSession by OSD id.
 *
 * @returns 0 on success, or -EAGAIN if the lock context requires
 * promotion to write.
 */
int Objecter::_get_session(int osd, OSDSession **session,
			   shunique_lock<ceph::shared_mutex>& sul)
{
  ceph_assert(sul && sul.mutex() == &rwlock);

  if (osd < 0) {
    *session = homeless_session;
    ldout(cct, 20) << __func__ << " osd=" << osd << " returning homeless"
		   << dendl;
    return 0;
  }

  auto p = osd_sessions.find(osd);
  if (p != osd_sessions.end()) {
    auto s = p->second;
    s->get();
    *session = s;
    ldout(cct, 20) << __func__ << " s=" << s << " osd=" << osd << " "
		   << s->get_nref() << dendl;
    return 0;
  }
  if (!sul.owns_lock()) {
    return -EAGAIN;
  }
  auto s = new OSDSession(cct, osd);
  osd_sessions[osd] = s;
  s->con = messenger->connect_to_osd(osdmap->get_addrs(osd));
  s->con->set_priv(RefCountedPtr{s});
  logger->inc(l_osdc_osd_session_open);
  logger->set(l_osdc_osd_sessions, osd_sessions.size());
  s->get();
  *session = s;
  ldout(cct, 20) << __func__ << " s=" << s << " osd=" << osd << " "
		 << s->get_nref() << dendl;
  return 0;
}
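
_get_session is a read-mostly cache: under the shared lock it can only look up an existing session, and creating a new OSDSession (plus its connection) requires the exclusive lock, hence the -EAGAIN handshake with the caller seen in _op_submit. A generic sketch of this double-checked lookup (illustrative types, not the actual OSDSession):

#include <cerrno>
#include <map>
#include <memory>

struct Session { int osd; };     // illustrative stand-in for OSDSession

// Returns 0 and fills *out, or -EAGAIN when creating the session would
// require the write lock. `exclusive` says which mode the caller holds
// the rwlock in; the map may only be mutated under the write lock.
int get_session(std::map<int, std::shared_ptr<Session>>& sessions,
                int osd, std::shared_ptr<Session>* out, bool exclusive) {
  auto p = sessions.find(osd);
  if (p != sessions.end()) {     // fast path: valid under either lock mode
    *out = p->second;
    return 0;
  }
  if (!exclusive)
    return -EAGAIN;              // caller upgrades the lock and retries
  auto s = std::make_shared<Session>(Session{osd});
  sessions[osd] = s;             // slow path: insert under the write lock
  *out = s;
  return 0;
}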