ceph源码分析--monitor leader选举

本篇博客主要分为三个部分
1.概述
2.选举的时机
3.选举的过程

1.概述

monitor在运行的过程中,必须存在一个leader节点。众多的更新操作都是由leader节点来完成,写命令也会经由peon转发到leader节点进行处理。

leader的选择是根据rank值来的,rank值小的为leader,而rank值跟IP地址有关。

2.选举的时机

在ceph中有大致三处会引发选举
1.收到quorum exit/enter
2.收到选举消息
3.bootstrap之后

之所以这样排序,是为了按照从简单到复杂的顺序进行介绍。

1)收到quorum exit/enter命令

在这个过程中participating会被改变,导致该monitor退出选举(quorum exit)或者重新参与选举(quorum enter)

// Excerpt of the monitor command handler: only the "quorum" command branch
// is shown; the surrounding code is elided.
void Monitor::handle_command(MonOpRequestRef op)
{
    ····
    // "quorum" command: the "quorumcmd" argument selects exit or enter.
    else if (prefix == "quorum") {
        string quorumcmd;
        cmd_getval(g_ceph_context, cmdmap, "quorumcmd", quorumcmd);
        // "quorum exit": call an election first, then stop participating
        if (quorumcmd == "exit") {
          start_election();
          elector.stop_participating();
          rs = "stopped responding to quorum, initiated new election";
          r = 0;
        }
        // "quorum enter": resume participating first, then call an election
        else if (quorumcmd == "enter") {
          elector.start_participating();
          start_election();
          rs = "started responding to quorum, initiated new election";
          r = 0;
        }
    ····
}
2)收到选举消息

monitor收到选举消息后

// Excerpt of the monitor op dispatcher: only the MSG_MON_ELECTION case is
// shown; the surrounding code is elided.
void Monitor::dispatch_op(MonOpRequestRef op)
{
    ···
    // Election message: tag the op, verify caps, then hand it to the elector.
    case MSG_MON_ELECTION:
          op->set_type_election();
          //check privileges here for simplicity
          if (!op->get_session()->is_capable("mon", MON_CAP_X)) {
            dout(0) << "MMonElection received from entity without enough caps!"
              << op->get_session()->caps << dendl;
            break;
          }
          // Election messages are not forwarded while this monitor is
          // probing or synchronizing.
          if (!is_probing() && !is_synchronizing()) {
            elector.dispatch(op);
          }
          break;
···
}

调用了void Elector::dispatch(MonOpRequestRef op)
其中能够引发新一轮选举的是OP_PROPOSE消息

// Excerpt of the elector's message dispatcher: only the OP_PROPOSE case is
// shown; the surrounding code is elided.
void Elector::dispatch(MonOpRequestRef op)
{
···
    // A propose from another monitor is handled by handle_propose().
    case MMonElection::OP_PROPOSE:
        handle_propose(op);
        return;
···
}

进入void Elector::handle_propose(MonOpRequestRef op)

// Excerpt of Elector::handle_propose showing the two places where handling a
// propose calls mon->start_election(); the rest of the function is elided.
void Elector::handle_propose(MonOpRequestRef op)
{
···
// If the propose carries an "old" epoch and its sender is not in the quorum,
// start a new election.
if (m->epoch < epoch) {
    // got an "old" propose,
    if (epoch % 2 == 0 &&    // in a non-election cycle
    mon->quorum.count(from) == 0) {  // from someone outside the quorum
      // a mon just started up, call a new election so they can rejoin!
      dout(5) << " got propose from old epoch, quorum is " << mon->quorum 
          << ", " << m->get_source() << " must have just started" << dendl;
      // we may be active; make sure we reset things in the monitor appropriately.
      mon->start_election();
    } else {
      dout(5) << " ignoring old propose" << dendl;
      return;
    }
···
// If our rank is lower than the sender's there are two cases: if we have
// already acked someone (asserted to have a lower rank than the sender) we
// do not reply; if we have acked nobody, we should win, so we start an
// election ourselves.
if (mon->rank < from) {
    // i would win over them.
    if (leader_acked >= 0) {        // we already acked someone
      assert(leader_acked < from);  // and they still win, of course
      dout(5) << "no, we already acked " << leader_acked << dendl;
    } else {
      // wait, i should win!
      if (!electing_me) {
    mon->start_election();
      }
    }
  }
···
}
3)bootstrap之后

在bootstrap中也有两种情况
第一种:单monitor集群
void Monitor::win_standalone_election()

void Monitor::win_standalone_election()
{
  dout(1) << "win_standalone_election" << dendl;

  // bump election epoch, in case the previous epoch included other
  // monitors; we need to be able to make the distinction.
  elector.init();
  elector.advance_epoch();

  rank = monmap->get_rank(name);
  assert(rank == 0);
  set<int> q;
  q.insert(rank);

  map<int,Metadata> metadata;
  collect_metadata(&metadata[0]);

  win_election(elector.get_epoch(), q,
               CEPH_FEATURES_ALL,
               ceph::features::mon::get_supported(),
           metadata);
}

第二种:先probe其他monitor

博客中主要讲解第二种先probe其他monitor

// Excerpt of Monitor::bootstrap (part of the body is elided). A monitor that
// is alone in the monmap wins a standalone election immediately; otherwise it
// sends OP_PROBE to every other monmap member and to any extra probe peers.
void Monitor::bootstrap()
{
  dout(10) << "bootstrap" << dendl;
  ····
  // single-monitor cluster: win the election without probing anyone
  // singleton monitor?
  if (monmap->size() == 1 && rank == 0) {
    win_standalone_election();
    return;
  }

  reset_probe_timeout();

  // i'm outside the quorum
  if (monmap->contains(name))
    outside_quorum.insert(name);

  // probe monitors
  dout(10) << "probing other monitors" << dendl;
  // Probe every rank in the monmap except our own.
  for (unsigned i = 0; i < monmap->size(); i++) {
    if ((int)i != rank)
      messenger->send_message(new MMonProbe(monmap->fsid, MMonProbe::OP_PROBE, name, has_ever_joined),
                  monmap->get_inst(i));
  }
  // Also probe the addresses in extra_probe_peers, skipping our own address.
  for (set<entity_addr_t>::iterator p = extra_probe_peers.begin();
       p != extra_probe_peers.end();
       ++p) {
    if (*p != messenger->get_myaddr()) {
      entity_inst_t i;
      i.name = entity_name_t::MON(-1);
      i.addr = *p;
      messenger->send_message(new MMonProbe(monmap->fsid, MMonProbe::OP_PROBE, name, has_ever_joined), i);
    }
  }
}

接收方

// Entry point for every MMonProbe message. Messages from a different cluster
// (fsid mismatch) are dropped; everything else is routed on the probe op.
void Monitor::handle_probe(MonOpRequestRef op)
{
  MMonProbe *probe = static_cast<MMonProbe*>(op->get_req());
  dout(10) << "handle_probe " << *probe << dendl;

  // The sender must carry the same fsid as our monmap.
  if (probe->fsid != monmap->fsid) {
    dout(0) << "handle_probe ignoring fsid " << probe->fsid << " != " << monmap->fsid << dendl;
    return;
  }

  if (probe->op == MMonProbe::OP_PROBE) {
    // someone is probing us
    handle_probe_probe(op);
  } else if (probe->op == MMonProbe::OP_REPLY) {
    // answer to one of our own probes
    handle_probe_reply(op);
  } else if (probe->op == MMonProbe::OP_MISSING_FEATURES) {
    // only logged; no further action is taken here
    derr << __func__ << " missing features, have " << CEPH_FEATURES_ALL
         << ", required " << probe->required_features
         << ", missing " << (probe->required_features & ~CEPH_FEATURES_ALL)
         << dendl;
  }
  // any other op value is silently ignored
}

// Excerpt of Monitor::handle_probe_probe (the start of the body, including
// the declaration of `m`, is elided). Answers a peer's OP_PROBE with an
// OP_REPLY that carries our name, quorum, encoded monmap and paxos version
// range, unless the peer is so far ahead that we re-bootstrap instead.
void Monitor::handle_probe_probe(MonOpRequestRef op)
{
···
  if (!is_probing() && !is_synchronizing()) {
    // If the probing mon is way ahead of us, we need to re-bootstrap.
    // Normally we capture this case when we initially bootstrap, but
    // it is possible we pass those checks (we overlap with
    // quorum-to-be) but fail to join a quorum before it moves past
    // us.  We need to be kicked back to bootstrap so we can
    // synchonize, not keep calling elections.
    // the sender's data is newer than ours: bootstrap again
    if (paxos->get_version() + 1 < m->paxos_first_version) {
      dout(1) << " peer " << m->get_source_addr() << " has first_committed "
          << "ahead of us, re-bootstrapping" << dendl;
      bootstrap();
      goto out;

    }
  }

  MMonProbe *r;
  // build the reply and send it back on the connection the probe arrived on
  r = new MMonProbe(monmap->fsid, MMonProbe::OP_REPLY, name, has_ever_joined);
  r->name = name;
  r->quorum = quorum;
  monmap->encode(r->monmap_bl, m->get_connection()->get_features());
  r->paxos_first_version = paxos->get_first_committed();
  r->paxos_last_version = paxos->get_version();
  m->get_connection()->send_message(r);

  // did we discover a peer here?
  if (!monmap->contains(m->get_source_addr())) {
    dout(1) << " adding peer " << m->get_source_addr()
        << " to list of hints" << dendl;
    extra_probe_peers.insert(m->get_source_addr());
  }

 out:
  return;
}

处理probe_reply消息,在这期间会同步monitor的一些数据

// Handle an OP_REPLY to one of our probes.
//
// Only acted on while probing or electing. In order, the reply may:
//  - replace our monmap and re-bootstrap, if the peer's map differs and the
//    peer has joined a quorum with a newer epoch (or we never joined one);
//  - rename a "noname-" peer or fill in a blank initial address (epoch 0);
//  - start a sync if the peer's paxos versions are too far ahead of ours;
//  - start an election (or send MMonJoin) if the peer reports a quorum;
//  - otherwise record the peer in outside_quorum and start an election once
//    a majority of the monmap, including ourselves, is in that set.
void Monitor::handle_probe_reply(MonOpRequestRef op)
{
  MMonProbe *m = static_cast<MMonProbe*>(op->get_req());
  dout(10) << "handle_probe_reply " << m->get_source_inst() << *m << dendl;
  dout(10) << " monmap is " << *monmap << dendl;

  // discover name and addrs during probing or electing states.
  if (!is_probing() && !is_electing()) {
    return;
  }

  // newer map, or they've joined a quorum and we haven't?
  bufferlist mybl;
  monmap->encode(mybl, m->get_connection()->get_features());
  // make sure it's actually different; the checks below err toward
  // taking the other guy's map, which could cause us to loop.
  if (!mybl.contents_equal(m->monmap_bl)) {
    // Decode into an automatic object rather than a new/delete pair, so the
    // temporary map is released on every exit path (including an exception
    // escaping decode()).
    MonMap newmap;
    newmap.decode(m->monmap_bl);
    if (m->has_ever_joined && (newmap.get_epoch() > monmap->get_epoch() ||
                               !has_ever_joined)) {
      dout(10) << " got newer/committed monmap epoch " << newmap.get_epoch()
               << ", mine was " << monmap->get_epoch() << dendl;
      monmap->decode(m->monmap_bl);
      // the remote monmap is newer: adopt it and bootstrap again
      bootstrap();
      return;
    }
  }

  // rename peer?
  string peer_name = monmap->get_name(m->get_source_addr());
  if (monmap->get_epoch() == 0 && peer_name.compare(0, 7, "noname-") == 0) {
    dout(10) << " renaming peer " << m->get_source_addr() << " "
             << peer_name << " -> " << m->name << " in my monmap"
             << dendl;
    monmap->rename(peer_name, m->name);

    if (is_electing()) {
      bootstrap();
      return;
    }
  } else {
    dout(10) << " peer name is " << peer_name << dendl;
  }

  // new initial peer?
  if (monmap->get_epoch() == 0 &&
      monmap->contains(m->name) &&
      monmap->get_addr(m->name).is_blank_ip()) {
    dout(1) << " learned initial mon " << m->name << " addr " << m->get_source_addr() << dendl;
    monmap->set_addr(m->name, m->get_source_addr());

    bootstrap();
    return;
  }

  // end discover phase
  if (!is_probing()) {
    return;
  }

  assert(paxos != NULL);

  if (is_synchronizing()) {
    dout(10) << " currently syncing" << dendl;
    return;
  }

  entity_inst_t other = m->get_source_inst();

  if (m->paxos_last_version < sync_last_committed_floor) {
    dout(10) << " peer paxos versions [" << m->paxos_first_version
             << "," << m->paxos_last_version << "] < my sync_last_committed_floor "
             << sync_last_committed_floor << ", ignoring"
             << dendl;
  } else {
    if (paxos->get_version() < m->paxos_first_version &&
        m->paxos_first_version > 1) {  // no need to sync if we're 0 and they start at 1.
      dout(10) << " peer paxos first versions [" << m->paxos_first_version
               << "," << m->paxos_last_version << "]"
               << " vs my version " << paxos->get_version()
               << " (too far ahead)"
               << dendl;
      cancel_probe_timeout();
      sync_start(other, true);
      return;
    }
    if (paxos->get_version() + g_conf->paxos_max_join_drift < m->paxos_last_version) {
      dout(10) << " peer paxos last version " << m->paxos_last_version
               << " vs my version " << paxos->get_version()
               << " (too far ahead)"
               << dendl;
      cancel_probe_timeout();
      sync_start(other, false);
      return;
    }
  }

  // is there an existing quorum?
  if (m->quorum.size()) {
    dout(10) << " existing quorum " << m->quorum << dendl;

    dout(10) << " peer paxos version " << m->paxos_last_version
             << " vs my version " << paxos->get_version()
             << " (ok)"
             << dendl;

    if (monmap->contains(name) &&
        !monmap->get_addr(name).is_blank_ip()) {
      // i'm part of the cluster; just initiate a new election
      // (we are already in the monmap, so an election can start right away)
      start_election();
    } else {
      dout(10) << " ready to join, but i'm not in the monmap or my addr is blank, trying to join" << dendl;
      messenger->send_message(new MMonJoin(monmap->fsid, name, messenger->get_myaddr()),
                              monmap->get_inst(*m->quorum.begin()));
    }
  } else {
    if (monmap->contains(m->name)) {
      dout(10) << " mon." << m->name << " is outside the quorum" << dendl;
      outside_quorum.insert(m->name);
    } else {
      dout(10) << " mostly ignoring mon." << m->name << ", not part of monmap" << dendl;
      return;
    }

    // strict majority of the monmap
    unsigned need = monmap->size() / 2 + 1;
    dout(10) << " outside_quorum now " << outside_quorum << ", need " << need << dendl;
    if (outside_quorum.size() >= need) {
      if (outside_quorum.count(name)) {
        // more than half of the monitors must be reachable before an
        // election may be called
        dout(10) << " that's enough to form a new quorum, calling election" << dendl;
        start_election();
      } else {
        dout(10) << " that's enough to form a new quorum, but it does not include me; waiting" << dendl;
      }
    } else {
      dout(10) << " that's not yet enough for a new quorum, waiting" << dendl;
    }
  }
}

经过这些步骤之后,开始选举

3.选举的过程

选举从void Monitor::start_election()开始

// Put the monitor into the electing state and ask the elector to call a new
// election. Every path that triggers an election goes through here.
void Monitor::start_election()
{
  dout(10) << "start_election" << dendl;
  // Wait for paxos writes and reset monitor state before switching to
  // STATE_ELECTING; the order of these calls is significant.
  wait_for_paxos_write();
  _reset();
  state = STATE_ELECTING;

  // election counters
  logger->inc(l_mon_num_elections);
  logger->inc(l_mon_election_call);

  clog->info() << "mon." << name << " calling new monitor election";
  elector.call_election();
}

void call_election()

// Thin wrapper: calling an election simply runs start().
void call_election() {
    start();
  }

void Elector::start()

// Begin an election round with this monitor as the candidate.
//
// Does nothing beyond logging when `participating` is false. Otherwise it
// clears the ack set, reloads the epoch via init(), makes sure the epoch is
// odd (election cycle), records our own entry in acked_me, sends OP_PROPOSE
// to every other rank in the monmap and finally resets the election timer.
void Elector::start()
{
  // this flag is false after "ceph quorum exit" (presumably cleared by
  // stop_participating() -- not shown here)
  if (!participating) {
    dout(0) << "not starting new election -- not participating" << dendl;
    return;
  }
  dout(5) << "start -- can i be leader?" << dendl;
  // empty the set of monitors that have acked us
  acked_me.clear();
  init();

  // start by trying to elect me
  if (epoch % 2 == 0) {
    bump_epoch(epoch+1);  // odd == election cycle
  } else {
    // do a trivial db write just to ensure it is writeable.
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    t->put(Monitor::MONITOR_NAME, "election_writeable_test", rand());
    int r = mon->store->apply_transaction(t);
    assert(r >= 0);
  }
  start_stamp = ceph_clock_now();
  electing_me = true;
  // we count as having acked ourselves
  acked_me[mon->rank].cluster_features = CEPH_FEATURES_ALL;
  acked_me[mon->rank].mon_features = ceph::features::mon::get_supported();
  mon->collect_metadata(&acked_me[mon->rank].metadata);
  leader_acked = -1;

  // bcast to everyone else
  for (unsigned i=0; i<mon->monmap->size(); ++i) {
    if ((int)i == mon->rank) continue;
    // send the election proposal
    MMonElection *m =
      new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap);
    m->mon_features = ceph::features::mon::get_supported();
    mon->messenger->send_message(m, mon->monmap->get_inst(i));
  }

  reset_timer();
}

void Elector::init()

// Load the election epoch from the monitor store.
//
//  - nothing stored (0): start at epoch 1, without persisting it here;
//  - odd stored value (we stopped mid-election): advance to the next even
//    epoch and write that back to the store;
//  - even stored value: use it unchanged.
void Elector::init()
{
  epoch = mon->store->get(Monitor::MONITOR_NAME, "election_epoch");

  // First boot: no epoch has been recorded yet.
  if (epoch == 0) {
    dout(1) << "init, first boot, initializing epoch at 1 " << dendl;
    epoch = 1;
    return;
  }

  // Even epoch: the last election had finished, nothing to repair.
  const bool mid_election = (epoch % 2) != 0;
  if (!mid_election) {
    dout(1) << "init, last seen epoch " << epoch << dendl;
    return;
  }

  // Odd epoch: bump past the unfinished election and persist the new value.
  dout(1) << "init, last seen epoch " << epoch
          << ", mid-election, bumping" << dendl;
  ++epoch;
  auto txn(std::make_shared<MonitorDBStore::Transaction>());
  txn->put(Monitor::MONITOR_NAME, "election_epoch", epoch);
  mon->store->apply_transaction(txn);
}

void Elector::handle_propose(MonOpRequestRef op)

// Handle an OP_PROPOSE from another monitor.
//
// Steps, in order:
//  1. A peer lacking the required cluster features or the required mon
//     features is sent a nak and the propose is ignored.
//  2. A newer epoch is adopted; an older epoch is ignored unless we are in a
//     non-election (even) epoch and the sender is outside the quorum, in
//     which case a new election is started so the sender can rejoin.
//  3. Rank comparison: if our rank is lower we either stay with whoever we
//     already acked or start our own election; if the sender's rank is not
//     higher than ours we defer() to it, unless we already acked someone
//     with a lower rank than the sender.
void Elector::handle_propose(MonOpRequestRef op)
{
  op->mark_event("elector:handle_propose");
  MMonElection *m = static_cast<MMonElection*>(op->get_req());
  dout(5) << "handle_propose from " << m->get_source() << dendl;
  int from = m->get_source().num();

  assert(m->epoch % 2 == 1); // election
  uint64_t required_features = mon->get_required_features();
  mon_feature_t required_mon_features = mon->get_required_mon_features();

  dout(10) << __func__ << " required features " << required_features
           << " " << required_mon_features
           << ", peer features " << m->get_connection()->get_features()
           << " " << m->mon_features
           << dendl;

  if ((required_features ^ m->get_connection()->get_features()) &
      required_features) {
    dout(5) << " ignoring propose from mon" << from
            << " without required features" << dendl;
    nak_old_peer(op);
    return;
  } else if (!m->mon_features.contains_all(required_mon_features)) {
    // all the features in 'required_mon_features' not in 'm->mon_features'
    mon_feature_t missing = required_mon_features.diff(m->mon_features);
    dout(5) << " ignoring propose from mon." << from
            << " without required mon_features " << missing
            << dendl;
    nak_old_peer(op);
    // Actually ignore the propose, as the log line says and as the
    // cluster-features branch above does. Without this return we fell
    // through to the rank comparison and could defer() to a peer we had
    // just nak'ed.
    return;
  } else if (m->epoch > epoch) {
    bump_epoch(m->epoch);
  } else if (m->epoch < epoch) {
    // got an "old" propose,
    if (epoch % 2 == 0 &&    // in a non-election cycle
        // a propose from a monitor outside the quorum triggers an election
        mon->quorum.count(from) == 0) {  // from someone outside the quorum
      // a mon just started up, call a new election so they can rejoin!
      dout(5) << " got propose from old epoch, quorum is " << mon->quorum
              << ", " << m->get_source() << " must have just started" << dendl;
      // we may be active; make sure we reset things in the monitor appropriately.
      mon->start_election();
    } else {
      dout(5) << " ignoring old propose" << dendl;
      return;
    }
  }

  // Our rank is lower than the sender's: if we already acked someone do
  // nothing, otherwise start our own election. Our rank is not lower: ack
  // the sender via defer() unless we already acked someone with a lower rank
  // than the sender.
  if (mon->rank < from) {
    // i would win over them.
    if (leader_acked >= 0) {        // we already acked someone
      assert(leader_acked < from);  // and they still win, of course
      dout(5) << "no, we already acked " << leader_acked << dendl;
    } else {
      // wait, i should win!
      if (!electing_me) {
        mon->start_election();
      }
    }
  } else {
    // they would win over me
    if (leader_acked < 0 ||      // haven't acked anyone yet, or
        leader_acked > from ||   // they would win over who you did ack, or
        leader_acked == from) {  // this is the guy we're already deferring to
      defer(from);
    } else {
      // ignore them!
      dout(5) << "no, we already acked " << leader_acked << dendl;
    }
  }
}

void Elector::defer(int who)

// Give up our own candidacy (if any) and acknowledge monitor `who` as the
// leader candidate by sending it an OP_ACK for the current epoch.
//
// @param who  rank of the monitor being deferred to
void Elector::defer(int who)
{
  dout(5) << "defer to " << who << dendl;

  if (electing_me) {
    // drop out
    acked_me.clear();
    electing_me = false;
  }

  // ack them
  leader_acked = who;
  ack_stamp = ceph_clock_now();
  MMonElection *m = new MMonElection(MMonElection::OP_ACK, epoch, mon->monmap);
  m->mon_features = ceph::features::mon::get_supported();
  mon->collect_metadata(&m->metadata);

  // This field is unused completely in luminous, but jewel uses it to
  // determine whether we are a dumpling mon due to some crufty old
  // code.  It only needs to see this buffer non-empty, so put
  // something useless there.
  m->sharing_bl = mon->get_local_commands_bl(mon->get_required_mon_features());

  mon->messenger->send_message(m, mon->monmap->get_inst(who));

  // set a timer
  reset_timer(1.0);  // give the leader some extra time to declare victory
}

选举不超时的情况下
void Elector::handle_ack(MonOpRequestRef op)

// Handle an OP_ACK from a monitor that is deferring to us.
//
// An ack from a newer epoch restarts the election. Acks from peers lacking
// required cluster or mon features are ignored. While we are still a
// candidate the ack is recorded in acked_me, and victory() is declared as
// soon as every monitor in the monmap has acked.
void Elector::handle_ack(MonOpRequestRef op)
{
  op->mark_event("elector:handle_ack");
  MMonElection *m = static_cast<MMonElection*>(op->get_req());
  dout(5) << "handle_ack from " << m->get_source() << dendl;
  int from = m->get_source().num();

  assert(m->epoch % 2 == 1); // election
  if (m->epoch > epoch) {
    dout(5) << "woah, that's a newer epoch, i must have rebooted.  bumping and re-starting!" << dendl;
    bump_epoch(m->epoch);
    start();
    return;
  }
  assert(m->epoch == epoch);
  uint64_t required_features = mon->get_required_features();
  if ((required_features ^ m->get_connection()->get_features()) &
      required_features) {
    dout(5) << " ignoring ack from mon" << from
        << " without required features" << dendl;
    return;
  }

  mon_feature_t required_mon_features = mon->get_required_mon_features();
  if (!m->mon_features.contains_all(required_mon_features)) {
    mon_feature_t missing = required_mon_features.diff(m->mon_features);
    dout(5) << " ignoring ack from mon." << from
            << " without required mon_features " << missing
            << dendl;
    return;
  }

  if (electing_me) {
    // thanks
    acked_me[from].cluster_features = m->get_connection()->get_features();
    acked_me[from].mon_features = m->mon_features;
    acked_me[from].metadata = m->metadata;
    // log everyone who has acked so far, comma-separated
    dout(5) << " so far i have {";
    for (map<int, elector_info_t>::const_iterator p = acked_me.begin();
         p != acked_me.end();
         ++p) {
      if (p != acked_me.begin())
        *_dout << ",";
      *_dout << " mon." << p->first << ":"
             << " features " << p->second.cluster_features
             << " " << p->second.mon_features;
    }
    *_dout << " }" << dendl;

    // is that _everyone_?
    // once every monitor in the monmap has acked
    if (acked_me.size() == mon->monmap->size()) {
      // if yes, shortcut to election finish
      victory();
    }
  } else {
    // ignore, i'm deferring already.
    assert(leader_acked >= 0);
  }
}

选举超时的情况下

// Called when the election timer runs out.
//
// If we are a candidate and more than half of the monmap has acked us, we
// win. Otherwise the round failed: restart the election if this monitor has
// ever joined a quorum, or go back to bootstrap if it has not.
void Elector::expire()
{
  dout(5) << "election timer expired" << dendl;

  // On timeout a strict majority of acks is enough (no need for everyone).
  const bool have_majority =
    electing_me &&
    acked_me.size() > (unsigned)(mon->monmap->size() / 2);

  if (have_majority) {
    // i win
    victory();
    return;
  }

  // whoever i deferred to didn't declare victory quickly enough.
  if (!mon->has_ever_joined) {
    mon->bootstrap();
  } else {
    start();
  }
}

如果赢得选举
void Elector::victory()

void Elector::victory()
{
  leader_acked = -1;
  electing_me = false;

  uint64_t cluster_features = CEPH_FEATURES_ALL;
  mon_feature_t mon_features = ceph::features::mon::get_supported();
  set<int> quorum;
  map<int,Metadata> metadata;
  for (map<int, elector_info_t>::iterator p = acked_me.begin();
       p != acked_me.end();
       ++p) {
    quorum.insert(p->first);
    cluster_features &= p->second.cluster_features;
    mon_features &= p->second.mon_features;
    metadata[p->first] = p->second.metadata;
  }

  cancel_timer();

  assert(epoch % 2 == 1);  // election
  bump_epoch(epoch+1);     // is over!

  // tell everyone!
  for (set<int>::iterator p = quorum.begin();
       p != quorum.end();
       ++p) {
    if (*p == mon->rank) continue;
    MMonElection *m = new MMonElection(MMonElection::OP_VICTORY, epoch,
                       mon->monmap);
    m->quorum = quorum;
    m->quorum_features = cluster_features;
    m->mon_features = mon_features;
    m->sharing_bl = mon->get_local_commands_bl(mon_features);
    mon->messenger->send_message(m, mon->monmap->get_inst(*p));
  }

  // tell monitor
  //赢得选举,把自己标为leader
  mon->win_election(epoch, quorum,
                    cluster_features, mon_features, metadata);
}

void Elector::handle_victory(MonOpRequestRef op)

// Handle an OP_VICTORY from the monitor that won the election.
//
// The sender must have a lower rank than ours and the message epoch must be
// even (both asserted). If the epoch is not exactly the one following ours
// we restart the election; otherwise we adopt the epoch, tell the monitor it
// lost, cancel our timer and store the leader's command set.
void Elector::handle_victory(MonOpRequestRef op)
{
  op->mark_event("elector:handle_victory");
  MMonElection *m = static_cast<MMonElection*>(op->get_req());
  dout(5) << "handle_victory from " << m->get_source()
          << " quorum_features " << m->quorum_features
          << " " << m->mon_features
          << dendl;
  int from = m->get_source().num();

  assert(from < mon->rank);
  assert(m->epoch % 2 == 0);  

  leader_acked = -1;

  // i should have seen this election if i'm getting the victory.
  if (m->epoch != epoch + 1) { 
    dout(5) << "woah, that's a funny epoch, i must have rebooted.  bumping and re-starting!" << dendl;
    bump_epoch(m->epoch);
    start();
    return;
  }

  bump_epoch(m->epoch);

  // they win
  // we lost the election: the monitor marks itself as a peon
  mon->lose_election(epoch, m->quorum, from,
                     m->quorum_features, m->mon_features);

  // cancel my timer
  cancel_timer();

  // stash leader's commands
  assert(m->sharing_bl.length());
  vector<MonCommand> new_cmds;
  bufferlist::iterator bi = m->sharing_bl.begin();
  MonCommand::decode_vector(new_cmds, bi);
  mon->set_leader_commands(new_cmds);
}

初始化leader
void Monitor::win_election(epoch_t epoch, set<int>& active, uint64_t features, const mon_feature_t& mon_features, const map<int,Metadata>& metadata)

// Become the leader after winning an election.
//
// @param epoch         election epoch that was won
// @param active        ranks that make up the new quorum
// @param features      connection features recorded for the quorum
// @param mon_features  mon features recorded for the quorum
// @param metadata      per-rank metadata collected during the election
//
// Must be called while electing (asserted). Records leader/quorum state,
// initialises paxos as leader, notifies the services, queues the merged
// metadata into the pending paxos transaction and finishes the election.
void Monitor::win_election(epoch_t epoch, set<int>& active, uint64_t features,const mon_feature_t& mon_features,const map<int,Metadata>& metadata)
{
  dout(10) << __func__ << " epoch " << epoch << " quorum " << active
       << " features " << features
           << " mon_features " << mon_features
           << dendl;
  assert(is_electing());
  state = STATE_LEADER;
  leader_since = ceph_clock_now();
  leader = rank;
  quorum = active;
  quorum_con_features = features;
  quorum_mon_features = mon_features;
  pending_metadata = metadata;
  outside_quorum.clear();

  clog->info() << "mon." << name << "@" << rank
        << " won leader election with quorum " << quorum;

  set_leader_commands(get_local_commands(mon_features));

  paxos->leader_init();
  // NOTE: tell monmap monitor first.  This is important for the
  // bootstrap case to ensure that the very first paxos proposal
  // codifies the monmap.  Otherwise any manner of chaos can ensue
  // when monitors are call elections or participating in a paxos
  // round without agreeing on who the participants are.
  monmon()->election_finished();
  _finish_svc_election();
  health_monitor->start(epoch);

  logger->inc(l_mon_election_win);

  // inject new metadata in first transaction.
  {
    // include previous metadata for missing mons (that aren't part of
    // the current quorum).
    map<int,Metadata> m = metadata;
    // NOTE: the loop variable `rank` shadows the `rank` used above for
    // this monitor's own rank.
    for (unsigned rank = 0; rank < monmap->size(); ++rank) {
      if (m.count(rank) == 0 &&
      mon_metadata.count(rank)) {
    m[rank] = mon_metadata[rank];
      }
    }

    // FIXME: This is a bit sloppy because we aren't guaranteed to submit
    // a new transaction immediately after the election finishes.  We should
    // do that anyway for other reasons, though.
    MonitorDBStore::TransactionRef t = paxos->get_pending_transaction();
    bufferlist bl;
    ::encode(m, bl);
    t->put(MONITOR_STORE_PREFIX, "last_metadata", bl);
  }

  finish_election();
  // These periodic tasks are only started when the monmap has more than one
  // monitor and a non-zero epoch.
  if (monmap->size() > 1 &&
      monmap->get_epoch() > 0) {
    timecheck_start();
    health_tick_start();
    do_health_to_clog_interval();
    scrub_event_start();
  }
}

输掉选举,把自己初始化成peon
void Monitor::lose_election(epoch_t epoch, set<int> &q, int l, uint64_t features, const mon_feature_t& mon_features)

// Become a peon after another monitor won the election.
//
// @param epoch         election epoch that just finished
// @param q             ranks that make up the new quorum
// @param l             rank of the leader
// @param features      connection features recorded for the quorum
// @param mon_features  mon features recorded for the quorum
void Monitor::lose_election(epoch_t epoch, set<int> &q, int l,
                            uint64_t features,
                            const mon_feature_t& mon_features)
{
  state = STATE_PEON;
  // a default-constructed utime_t: we are not the leader
  leader_since = utime_t();
  leader = l;
  quorum = q;
  outside_quorum.clear();
  quorum_con_features = features;
  quorum_mon_features = mon_features;
  dout(10) << "lose_election, epoch " << epoch << " leader is mon" << leader
       << " quorum is " << quorum << " features are " << quorum_con_features
           << " mon_features are " << quorum_mon_features
           << dendl;

  paxos->peon_init();
  _finish_svc_election();
  health_monitor->start(epoch);

  logger->inc(l_mon_election_lose);

  finish_election();

  // Send our metadata to the leader in a separate MMonMetadata message only
  // when the quorum has MON_METADATA but not SERVER_LUMINOUS.
  if ((quorum_con_features & CEPH_FEATURE_MON_METADATA) &&
      !HAVE_FEATURE(quorum_con_features, SERVER_LUMINOUS)) {
    // for pre-luminous mons only
    Metadata sys_info;
    collect_metadata(&sys_info);
    messenger->send_message(new MMonMetadata(sys_info),
                monmap->get_inst(get_leader()));
  }
}

选举结束void Monitor::finish_election()

// Common tail of win_election() and lose_election(): applies quorum and
// monmap features, finishes the contexts that were waiting for a quorum,
// resends routed requests, refreshes the loggers and, if the monmap knows
// our address under a different name, asks to be renamed.
void Monitor::finish_election()
{
  apply_quorum_to_compatset_features();
  apply_monmap_to_compatset_features();
  timecheck_finish();
  // reset to a default-constructed utime_t now that the election is over
  exited_quorum = utime_t();
  finish_contexts(g_ceph_context, waitfor_quorum);
  finish_contexts(g_ceph_context, maybe_wait_for_quorum);
  resend_routed_requests();
  update_logger();
  register_cluster_logger();

  // am i named properly?
  string cur_name = monmap->get_name(messenger->get_myaddr());
  if (cur_name != name) {
    dout(10) << " renaming myself from " << cur_name << " -> " << name << dendl;
    // send the join request to the first (lowest-ordered) member of the quorum set
    messenger->send_message(new MMonJoin(monmap->fsid, name, messenger->get_myaddr()),
                monmap->get_inst(*quorum.begin()));
  }
}

至此,选举的基本流程已经介绍完毕。
关于选举的部分还遗留着几个问题
1.handle_nak没有介绍
2.完成选举之后的初始化(paxos)
3.为什么收到集群外的选举需要重新发起选举?

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值