本篇博客主要分为三个部分
1.概述
2.选举的时机
3.选举的过程
1.概述
monitor在运行的过程中,必须存在一个leader节点。众多的更新操作都是由leader节点来完成的,写命令也会经由peon转发到leader节点执行。
leader是根据rank值选出的:参与选举的monitor中rank值最小的成为leader,而rank值由monitor的地址(IP:端口)在monmap中的排序决定。
2.选举的时机
在ceph中有大致三处会引发选举
1.收到quorum exit/enter命令
2.收到选举消息
3.bootstrap之后
之所以这样排序,是为了按照从简单到复杂的顺序进行讲解。
1)收到quorum exit/enter命令
在这个过程中,elector的participating状态会被改变:quorum exit使该monitor不再参与选举,quorum enter使其重新参与选举;两种情况下都会发起一次新的选举。
void Monitor::handle_command(MonOpRequestRef op)
{
····
else if (prefix == "quorum") {
string quorumcmd;
cmd_getval(g_ceph_context, cmdmap, "quorumcmd", quorumcmd);
//quorum exit发起选举
if (quorumcmd == "exit") {
start_election();
elector.stop_participating();
rs = "stopped responding to quorum, initiated new election";
r = 0;
}
//quorum enter发起选举
else if (quorumcmd == "enter") {
elector.start_participating();
start_election();
rs = "started responding to quorum, initiated new election";
r = 0;
}
····
}
2)收到选举消息
monitor收到选举消息后,会在Monitor::dispatch_op中进行处理:
void Monitor::dispatch_op(MonOpRequestRef op)
{
···
case MSG_MON_ELECTION:
op->set_type_election();
//check privileges here for simplicity
if (!op->get_session()->is_capable("mon", MON_CAP_X)) {
dout(0) << "MMonElection received from entity without enough caps!"
<< op->get_session()->caps << dendl;
break;
}
if (!is_probing() && !is_synchronizing()) {
elector.dispatch(op);
}
break;
···
}
调用了void Elector::dispatch(MonOpRequestRef op)
能够引发选举的消息类型是OP_PROPOSE,因此这里只关注这一分支
void Elector::dispatch(MonOpRequestRef op)
{
···
case MMonElection::OP_PROPOSE:
handle_propose(op);
return;
···
}
进入void Elector::handle_propose(MonOpRequestRef op)
void Elector::handle_propose(MonOpRequestRef op)
{
···
//如果收到一个“旧”的epoch,但是它并不在quorum里,发起选举
if (m->epoch < epoch) {
// got an "old" propose,
if (epoch % 2 == 0 && // in a non-election cycle
mon->quorum.count(from) == 0) { // from someone outside the quorum
// a mon just started up, call a new election so they can rejoin!
dout(5) << " got propose from old epoch, quorum is " << mon->quorum
<< ", " << m->get_source() << " must have just started" << dendl;
// we may be active; make sure we reset things in the monitor appropriately.
mon->start_election();
} else {
dout(5) << " ignoring old propose" << dendl;
return;
}
···
//如果自身的rank值比发消息的monitor rank值要小,这时候要分两种情况
//如果已经回复过别人,说明已经有更小的rank值,则不回复,若没有回复过别人则自身rank值较小应发起选举
if (mon->rank < from) {
// i would win over them.
if (leader_acked >= 0) { // we already acked someone
assert(leader_acked < from); // and they still win, of course
dout(5) << "no, we already acked " << leader_acked << dendl;
} else {
// wait, i should win!
if (!electing_me) {
mon->start_election();
}
}
}
···
}
3)bootstrap之后
在bootstrap中也有两种情况
第一种:单monitor集群
对应的函数是void Monitor::win_standalone_election():
// Declare this monitor the winner of an election without exchanging any
// election messages with peers. It resets the elector, bumps the election
// epoch, and calls win_election() with a quorum that contains only this
// monitor's own rank.
void Monitor::win_standalone_election()
{
dout(1) << "win_standalone_election" << dendl;
// bump election epoch, in case the previous epoch included other
// monitors; we need to be able to make the distinction.
elector.init();
elector.advance_epoch();
// Refresh our rank from the monmap; this path expects us to be rank 0.
rank = monmap->get_rank(name);
assert(rank == 0);
// The quorum consists of this monitor alone.
set<int> q;
q.insert(rank);
// metadata[0] creates the map entry for key 0 (our rank, per the assert
// above) and hands its address to collect_metadata().
map<int,Metadata> metadata;
collect_metadata(&metadata[0]);
// Arguments: the epoch just advanced, the single-member quorum,
// CEPH_FEATURES_ALL, the supported mon feature set, and the metadata map.
win_election(elector.get_epoch(), q,
CEPH_FEATURES_ALL,
ceph::features::mon::get_supported(),
metadata);
}
第二种:先probe其他monitor
本篇博客主要讲解第二种情况,即先probe其他monitor。
void Monitor::bootstrap()
{
dout(10) << "bootstrap" << dendl;
····
//单monitor集群选举
// singleton monitor?
if (monmap->size() == 1 && rank == 0) {
win_standalone_election();
return;
}
reset_probe_timeout();
// i'm outside the quorum
if (monmap->contains(name))
outside_quorum.insert(name);
// probe monitors
dout(10) << "probing other monitors" << dendl;
for (unsigned i = 0; i < monmap->size(); i++) {
if ((int)i != rank)
messenger->send_message(new MMonProbe(monmap->fsid, MMonProbe::OP_PROBE, name, has_ever_joined),
monmap->get_inst(i));
}
for (set<entity_addr_t>::iterator p = extra_probe_peers.begin();
p != extra_probe_peers.end();
++p) {
if (*p != messenger->get_myaddr()) {
entity_inst_t i;
i.name = entity_name_t::MON(-1);
i.addr = *p;
messenger->send_message(new MMonProbe(monmap->fsid, MMonProbe::OP_PROBE, name, has_ever_joined), i);
}
}
}
接收方收到probe消息后,由Monitor::handle_probe进行处理:
void Monitor::handle_probe(MonOpRequestRef op)
{
MMonProbe *m = static_cast<MMonProbe*>(op->get_req());
dout(10) << "handle_probe " << *m << dendl;
//必须有相同的fsid
if (m->fsid != monmap->fsid) {
dout(0) << "handle_probe ignoring fsid " << m->fsid << " != " << monmap->fsid << dendl;
return;
}
switch (m->op) {
//处理probe
case MMonProbe::OP_PROBE:
handle_probe_probe(op);
break;
case MMonProbe::OP_REPLY:
handle_probe_reply(op);
break;
case MMonProbe::OP_MISSING_FEATURES:
derr << __func__