Ceph 中的 monitor 可以分为 leader 节点和 peon 节点，leader 节点是根据 rank 值来决定的，rank 值又和 IP 地址有关。
在 ceph/src/mon/Elector.cc 这个文件中实现的 Elector 类用于选举 monitor 中的 leader 节点。
leader节点的选举从start 这个成员函数开始
// Begins an election round: clears leader_acked and sends an OP_PROPOSE
// message to every monitor in the monmap except this one.
// NOTE(review): abridged excerpt -- `epoch` is read here but nothing in the
// lines shown assigns it.
void Elector::start()
{
// -1 means "no one acked yet" (handle_propose checks leader_acked < 0 for that)
leader_acked = -1;
#可以看到这里会通过mon->monmap->size() 得到总的monitor的size,然后想所有的monitor
#发送OP_PROPOSE 消息
// bcast to everyone else: iterate over all mon->monmap->size() ranks
for (unsigned i=0; i<mon->monmap->size(); ++i) {
// do not send a proposal to ourselves
if ((int)i == mon->rank) continue;
MMonElection *m =
new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap);
// attach the feature set returned by ceph::features::mon::get_supported()
m->mon_features = ceph::features::mon::get_supported();
// deliver to the monitor at index i
mon->messenger->send_message(m, mon->monmap->get_inst(i));
}
}
在 Elector 类中，消息的处理都是在 dispatch 这个函数中完成的：
switch (em->op) {
case MMonElection::OP_PROPOSE:
handle_propose(op);
return;
}
从dispatch中我们可以知道OP_PROPOSE 对应的处理函数是handle_propose
// Handles an incoming OP_PROPOSE message (dispatch routes OP_PROPOSE here).
// In the branch shown, this monitor defers to the proposer `from` unless it
// has already acked a monitor whose number is smaller than `from`.
// NOTE(review): abridged excerpt -- `from` is not declared in the lines shown
// and the enclosing branch is elided, which is why an extra closing brace
// appears before the end of the function.
void Elector::handle_propose(MonOpRequestRef op)
{
#我们只分析这个case
// they would win over me
if (leader_acked < 0 || // haven't acked anyone yet, or
leader_acked > from || // they would win over who you did ack, or
leader_acked == from) { // this is the guy we're already deferring to
// record `from` as the acked leader and send it an OP_ACK (see defer())
defer(from);
} else {
// ignore them!
// leader_acked is non-negative and smaller than `from` here
dout(5) << "no, we already acked " << leader_acked << dendl;
}
}
}
// Defers to monitor `who`: records it in leader_acked, stamps the time of the
// ack, sends `who` an OP_ACK carrying our supported monitor features and
// collected metadata, and then calls reset_timer(1.0).
//
// @param who  index of the monitor being acked; it is passed to
//             mon->monmap->get_inst() to address the OP_ACK.
void Elector::defer(int who)
{
#在这个函数中有发送消息OP_ACK
// ack them
leader_acked = who;
// remember when the ack was issued
ack_stamp = ceph_clock_now();
MMonElection *m = new MMonElection(MMonElection::OP_ACK, epoch, mon->monmap);
m->mon_features = ceph::features::mon::get_supported();
// fill the message's metadata field via the monitor
mon->collect_metadata(&m->metadata);
mon->messenger->send_message(m, mon->monmap->get_inst(who));
// set a timer
reset_timer(1.0); // give the leader some extra time to declare victory
}
在 dispatch 函数中，OP_ACK 对应的处理分支为：
switch (em->op) {
case MMonElection::OP_ACK:
handle_ack(op);
return;
处理函数为handle_ack
// Handles an incoming OP_ACK message (dispatch routes OP_ACK here).
// While this monitor is the candidate (electing_me), the sender's features and
// metadata are stored in acked_me; once acked_me holds as many entries as the
// monmap has monitors, victory() is called immediately. If this monitor is not
// the candidate, the ack is ignored.
// NOTE(review): abridged excerpt -- `from` and `m` are not declared in the
// lines shown.
void Elector::handle_ack(MonOpRequestRef op)
{
#如果选举当前节点作为leader
if (electing_me) {
// thanks
// record what the acking monitor reported, keyed by `from`
acked_me[from].cluster_features = m->get_connection()->get_features();
acked_me[from].mon_features = m->mon_features;
acked_me[from].metadata = m->metadata;
// log every ack collected so far as a comma-separated list
dout(5) << " so far i have {";
for (map<int, elector_info_t>::const_iterator p = acked_me.begin();
p != acked_me.end();
++p) {
// separator before every entry except the first
if (p != acked_me.begin())
*_dout << ",";
*_dout << " mon." << p->first << ":"
<< " features " << p->second.cluster_features
<< " " << p->second.mon_features;
}
*_dout << " }" << dendl;
// is that _everyone_?
#如果所有的monitor都已经回复了,那就是调用victory 宣布当前节点是leader节点
if (acked_me.size() == mon->monmap->size()) {
// if yes, shortcut to election finish
victory();
}
} else {
// ignore, i'm deferring already.
// not a candidate, so some monitor must already have been acked
assert(leader_acked >= 0);
}
}
// Announces the election result: sends an OP_VICTORY message (carrying the
// quorum, feature sets and a command buffer) to every member of `quorum`
// other than this monitor, then calls mon->win_election() locally.
// NOTE(review): abridged excerpt -- `cluster_features`, `mon_features` and
// `metadata` are not declared in the lines shown.
void Elector::victory()
{
#告诉所有的monitor及诶单quorum/cluster_features 等信息
// tell everyone!
for (set<int>::iterator p = quorum.begin();
p != quorum.end();
++p) {
// no need to message ourselves
if (*p == mon->rank) continue;
#注意这里会发送OP_VICTORY 消息来帮自己设置为leader节点
MMonElection *m = new MMonElection(MMonElection::OP_VICTORY, epoch,
mon->monmap);
// payload consumed by the receiver's handle_victory()
m->quorum = quorum;
m->quorum_features = cluster_features;
m->mon_features = mon_features;
m->sharing_bl = mon->get_local_commands_bl(mon_features);
mon->messenger->send_message(m, mon->monmap->get_inst(*p));
}
// tell the local monitor that this node has been elected leader
mon->win_election(epoch, quorum,
cluster_features, mon_features, metadata);
}
OP_VICTORY消息对应的处理函数为:
// Handles an incoming OP_VICTORY message.
// Validates the sender and epoch, clears leader_acked, and then either
// restarts the election (when the announced epoch is not exactly epoch + 1)
// or accepts the result by calling mon->lose_election() and cancelling the
// election timer.
//
// @param op  request wrapping the MMonElection message being handled.
void Elector::handle_victory(MonOpRequestRef op)
{
op->mark_event("elector:handle_victory");
MMonElection *m = static_cast<MMonElection*>(op->get_req());
dout(5) << "handle_victory from " << m->get_source()
<< " quorum_features " << m->quorum_features
<< " " << m->mon_features
<< dendl;
int from = m->get_source().num();
// the sender's number must be strictly lower than our own rank
assert(from < mon->rank);
// the announced epoch must be even
assert(m->epoch % 2 == 0);
// forget whoever we had acked
leader_acked = -1;
// i should have seen this election if i'm getting the victory.
if (m->epoch != epoch + 1) {
dout(5) << "woah, that's a funny epoch, i must have rebooted. bumping and re-starting!" << dendl;
// adopt the announced epoch, then begin a new election round
bump_epoch(m->epoch);
start();
return;
}
bump_epoch(m->epoch);
#输掉选举的话,把自己设置为peon节点
// they win
mon->lose_election(epoch, m->quorum, from,
m->quorum_features, m->mon_features);
// cancel my timer
cancel_timer();
}
如果有的 monitor 在规定的时间内没有回复，则会调用 expire：
// Called when the election timer expires.
// If this monitor is the candidate and the number of entries in acked_me is
// strictly greater than half the monmap size (integer division), it declares
// victory. Otherwise it starts a new election round via start() when
// mon->has_ever_joined is set, and falls back to mon->bootstrap() only when
// it is not.
void Elector::expire()
{
dout(5) << "election timer expired" << dendl;
#从这里可以看到只要有超过一半的monitor 回复,仍然认为当前节点获胜为leader节点,否则就调用mon->bootstrap() 重新开始选举
// did i win?
if (electing_me &&
acked_me.size() > (unsigned)(mon->monmap->size() / 2)) {
// i win
victory();
} else {
// whoever i deferred to didn't declare victory quickly enough.
// has_ever_joined set: re-run the election; otherwise: bootstrap
if (mon->has_ever_joined)
start();
else
mon->bootstrap();
}
}
ceph monitor 选举leader和peon的过程
最新推荐文章于 2024-10-28 20:54:47 发布