ceph monitor 选举leader和peon的过程

ceph中monitor 可以分为leader节点和peon节点,leader节点是根据rank值来决定的,rank值又和ip地址有关。
在ceph/mon/elector.cc 这个文件中实现的Elector 类用于选举monitor中的leader节点。
leader节点的选举从start 这个成员函数开始:
void Elector::start()
{
    leader_acked = -1;

#可以看到这里会通过mon->monmap->size() 得到monitor的总数,然后向除自己以外的所有monitor
#发送OP_PROPOSE 消息
  // bcast to everyone else
  for (unsigned i=0; i<mon->monmap->size(); ++i) {
    if ((int)i == mon->rank) continue;
    MMonElection *m =
      new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap);
    m->mon_features = ceph::features::mon::get_supported();
    mon->messenger->send_message(m, mon->monmap->get_inst(i));
  }

}
在Elector 类中,消息的处理都是在dispatch 这个函数中完成的:
    switch (em->op) {
      case MMonElection::OP_PROPOSE:
	handle_propose(op);
	return;
      }
从dispatch中我们可以知道OP_PROPOSE 对应的处理函数是handle_propose
void Elector::handle_propose(MonOpRequestRef op)
{
  #我们只分析这个case
    // they would win over me
    if (leader_acked < 0 ||      // haven't acked anyone yet, or
	leader_acked > from ||   // they would win over who you did ack, or
	leader_acked == from) {  // this is the guy we're already deferring to
      defer(from);
    } else {
      // ignore them!
      dout(5) << "no, we already acked " << leader_acked << dendl;
    }
  }
}
// Defer to monitor `who`: remember it as the candidate this node has acked,
// send it an OP_ACK election message, and re-arm the election timer.
void Elector::defer(int who)
{
#在这个函数中有发送消息OP_ACK
  // ack them
  leader_acked = who;
  ack_stamp = ceph_clock_now();
  MMonElection *m = new MMonElection(MMonElection::OP_ACK, epoch, mon->monmap);
  // the ack carries this node's supported mon features and its metadata
  m->mon_features = ceph::features::mon::get_supported();
  mon->collect_metadata(&m->metadata);

  // the ack is sent only to the monitor being deferred to
  mon->messenger->send_message(m, mon->monmap->get_inst(who));
  
  // set a timer
  reset_timer(1.0);  // give the leader some extra time to declare victory
}

dispatch函数中对OP_ACK 消息的分发如下:
      switch (em->op) {
      case MMonElection::OP_ACK:
	handle_ack(op);
	return;

处理函数为handle_ack
// Handle an OP_ACK election message: while this node is a candidate, record
// the sender's ack and declare victory once every monitor in the monmap has
// acked.
// NOTE(review): `m` and `from` are used below but are not declared in this
// excerpt -- presumably the decoded MMonElection and the sender's rank;
// confirm against the full source.
void Elector::handle_ack(MonOpRequestRef op)
{
#如果选举当前节点作为leader
  if (electing_me) {
    // thanks
    // store the features and metadata reported with this ack, keyed by sender
    acked_me[from].cluster_features = m->get_connection()->get_features();
    acked_me[from].mon_features = m->mon_features;
    acked_me[from].metadata = m->metadata;
    // log every ack collected so far, comma-separated
    dout(5) << " so far i have {";
    for (map<int, elector_info_t>::const_iterator p = acked_me.begin();
         p != acked_me.end();
         ++p) {
      if (p != acked_me.begin())
        *_dout << ",";
      *_dout << " mon." << p->first << ":"
             << " features " << p->second.cluster_features
             << " " << p->second.mon_features;
    }
    *_dout << " }" << dendl;

    // is that _everyone_?
	#如果所有的monitor都已经回复了,那就是调用victory 宣布当前节点是leader节点
    if (acked_me.size() == mon->monmap->size()) {
      // if yes, shortcut to election finish
      victory();
    }
  } else {
    // ignore, i'm deferring already.
    // not a candidate, so this node must already have acked someone
    assert(leader_acked >= 0);
  }
}
void Elector::victory()
{
  
#告诉quorum中的所有monitor节点quorum/cluster_features 等信息
  // tell everyone!
  for (set<int>::iterator p = quorum.begin();
       p != quorum.end();
       ++p) {
    if (*p == mon->rank) continue;
	#注意这里会发送OP_VICTORY 消息,通知对方当前节点已成为leader节点
    MMonElection *m = new MMonElection(MMonElection::OP_VICTORY, epoch,
				       mon->monmap);
    m->quorum = quorum;
    m->quorum_features = cluster_features;
    m->mon_features = mon_features;
    m->sharing_bl = mon->get_local_commands_bl(mon_features);
    mon->messenger->send_message(m, mon->monmap->get_inst(*p));
  }

  // 告诉monitor 当前节点已经被选为leader节点
  mon->win_election(epoch, quorum,
                    cluster_features, mon_features, metadata);
}
OP_VICTORY消息对应的处理函数为:
// Handle an OP_VICTORY election message: sanity-check the sender and the
// announced epoch, then either restart the election (unexpected epoch) or
// accept the result by calling mon->lose_election() and cancelling the timer.
void Elector::handle_victory(MonOpRequestRef op)
{
  op->mark_event("elector:handle_victory");
  MMonElection *m = static_cast<MMonElection*>(op->get_req());
  dout(5) << "handle_victory from " << m->get_source()
          << " quorum_features " << m->quorum_features
          << " " << m->mon_features
          << dendl;
  int from = m->get_source().num();

  // the winner must have a lower rank than this node
  assert(from < mon->rank);
  // the announced epoch must be even
  // NOTE(review): presumably even epochs mark a concluded election -- confirm
  assert(m->epoch % 2 == 0);  

  // forget whoever this node had acked
  leader_acked = -1;

  // i should have seen this election if i'm getting the victory.
  if (m->epoch != epoch + 1) { 
    dout(5) << "woah, that's a funny epoch, i must have rebooted.  bumping and re-starting!" << dendl;
    // adopt the announced epoch, then start a new election instead of
    // accepting this result
    bump_epoch(m->epoch);
    start();
    return;
  }

  bump_epoch(m->epoch);
  #输掉选举的话,把自己设置为peon节点
  // they win
  mon->lose_election(epoch, m->quorum, from,
                     m->quorum_features, m->mon_features);

  // cancel my timer
  cancel_timer();

}

如果有的monitor在规定的时间内没有回复,选举定时器超时后则会调用expire
void Elector::expire()
{
  dout(5) << "election timer expired" << dendl;
  #从这里可以看到只要当前节点正在竞选(electing_me)且有超过一半的monitor 回复,仍然认为当前节点获胜成为leader节点;否则,如果mon->has_ever_joined 为真就调用start() 重新发起选举,不然就调用mon->bootstrap() 重新开始
  // did i win?
  if (electing_me &&
      acked_me.size() > (unsigned)(mon->monmap->size() / 2)) {
    // i win
    victory();
  } else {
    // whoever i deferred to didn't declare victory quickly enough.
    if (mon->has_ever_joined)
      start();
    else
      mon->bootstrap();
  }
}

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值