2021SC@SDUSC
背景介绍
该部分是选举过程最核心的代码。
正式选举过程中需要知道以下两个名词。
(2)zxid:服务器在运行时产生的数据id,zxid越大,表示数据越新。
(3)epoch:选举的轮数,即逻辑时钟,随着选举轮数的增加而递增。
回顾
while (running) {
switch (getPeerState()) {
case LOOKING:
LOG.info("LOOKING");
if (Boolean.getBoolean("readonlymode.enabled")) {
LOG.info("Attempting to start ReadOnlyZooKeeperServer");
// Create read-only server but don't start it immediately
final ReadOnlyZooKeeperServer roZk = new ReadOnlyZooKeeperServer(
logFactory, this,
new ZooKeeperServer.BasicDataTreeBuilder(),
this.zkDb);
// Instead of starting roZk immediately, wait some grace
// period before we decide we're partitioned.
//
// Thread is used here because otherwise it would require
// changes in each of election strategy classes which is
// unnecessary code coupling.
Thread roZkMgr = new Thread() {
public void run() {
try {
// lower-bound grace period to 2 secs
sleep(Math.max(2000, tickTime));
if (ServerState.LOOKING.equals(getPeerState())) {
roZk.startup();
}
} catch (InterruptedException e) {
LOG.info("Interrupted while attempting to start ReadOnlyZooKeeperServer, not started");
} catch (Exception e) {
LOG.error("FAILED to start ReadOnlyZooKeeperServer", e);
}
}
};
try {
roZkMgr.start();
setBCVote(null);
setCurrentVote(makeLEStrategy().lookForLeader());
上一篇分析到这里:如果服务器处于LOOKING状态,就会开始选举。
源码分析
我们跳转到 lookForLeader()方法的实现,有如下代码:
/**
 * Runs one round of leader election for this peer.
 *
 * Increments the logical clock, proposes this server (its id and last logged zxid), broadcasts
 * that proposal, and then processes incoming notifications for as long as the peer state is
 * LOOKING. When a decision is reached the peer state is set to LEADING or FOLLOWING and the
 * winning vote is returned.
 *
 * @return the vote (leader id and zxid) that was settled on, or null if the loop ends because
 *         the peer state is no longer LOOKING without a decision having been made here
 * @throws InterruptedException declared by the signature; the timed poll on recvqueue and
 *         Thread.sleep are the blocking calls in this method
 */
public Vote lookForLeader() throws InterruptedException {
// Register a JMX bean for this election. Failure is only logged; the election still runs.
try {
self.jmxLeaderElectionBean = new LeaderElectionBean();
MBeanRegistry.getInstance().register(
self.jmxLeaderElectionBean, self.jmxLocalPeerBean);
} catch (Exception e) {
LOG.warn("Failed to register with JMX", e);
self.jmxLeaderElectionBean = null;
}
try {
// Votes received in the LOOKING case below, keyed by sender address.
HashMap<InetSocketAddress, Vote> recvset =
new HashMap<InetSocketAddress, Vote>();
// Votes reported by peers whose notification state is LEADING or FOLLOWING.
HashMap<InetSocketAddress, Vote> outofelection =
new HashMap<InetSocketAddress, Vote>();
// Start a new round and initially propose this server with its last logged zxid.
logicalclock++;
proposedLeader = self.getId();
proposedZxid = self.getLastLoggedZxid();
LOG.info("Election tally");
sendNotifications();
/*
* Loop in which we exchange notifications until we find a leader
*/
while (self.getPeerState() == ServerState.LOOKING) {
/*
* Remove next notification from queue, times out after 2 times
* the termination time
*/
Notification n = recvqueue.poll(2 * finalizeWait,
TimeUnit.MILLISECONDS);
/*
* Sends more notifications if haven't received enough.
* Otherwise processes new notification.
*/
if (n == null) {
// Timed out: re-send only if some vote has already been seen
// (any out-of-election vote, or more than one vote in recvset).
if (((!outofelection.isEmpty()) || (recvset.size() > 1)))
sendNotifications();
} else
switch (n.state) {
case LOOKING:
if (n.epoch > logicalclock) {
// Sender is in a later round: adopt its epoch and discard the votes collected so far.
logicalclock = n.epoch;
recvset.clear();
// Switch to the sender's vote only if it outranks the current proposal.
if (totalOrderPredicate(n.leader, n.zxid)) {
proposedLeader = n.leader;
proposedZxid = n.zxid;
}
sendNotifications();
} else if (n.epoch < logicalclock) {
// Sender is in an older round: leave the switch without recording its vote.
break;
} else if (totalOrderPredicate(n.leader, n.zxid)) {
// Same round and the sender's vote outranks ours: adopt it and re-broadcast.
proposedLeader = n.leader;
proposedZxid = n.zxid;
sendNotifications();
}
// Record the sender's vote (reached for every epoch >= logicalclock case).
recvset.put(n.addr, new Vote(n.leader, n.zxid));
// If have received from all nodes, then terminate
if (self.getVotingView().size() == recvset.size()) {
self.setPeerState((proposedLeader == self.getId()) ?
ServerState.LEADING: ServerState.FOLLOWING);
// if (self.state == ServerState.FOLLOWING) {
// Thread.sleep(100);
// }
leaveInstance();
return new Vote(proposedLeader, proposedZxid);
// NOTE(review): termPredicate is defined outside this excerpt; presumably it checks
// that the proposal has enough supporting votes in the given set - confirm.
} else if (termPredicate(recvset, proposedLeader,
proposedZxid)) {
// Otherwise, wait for a fixed amount of time
LOG.info("Passed predicate");
Thread.sleep(finalizeWait);
// Notification probe = recvqueue.peek();
// Verify if there is any change in the proposed leader
// Discard queued notifications that do not outrank the current proposal;
// stop at the first one that does (it stays in the queue).
while ((!recvqueue.isEmpty())
&& !totalOrderPredicate(
recvqueue.peek().leader, recvqueue
.peek().zxid)) {
recvqueue.poll();
}
// Queue fully drained: nothing outranked the proposal, so finalize it.
// Otherwise fall through to the break and keep looping.
if (recvqueue.isEmpty()) {
// LOG.warn("Proposed leader: " +
// proposedLeader);
self.setPeerState(
(proposedLeader == self.getId()) ?
ServerState.LEADING :
ServerState.FOLLOWING);
leaveInstance();
return new Vote(proposedLeader, proposedZxid);
}
}
break;
case LEADING:
// Sender reports LEADING: record its vote and, if termPredicate accepts it
// over the out-of-election votes, adopt the leader it names.
outofelection.put(n.addr, new Vote(n.leader, n.zxid));
if (termPredicate(outofelection, n.leader, n.zxid)) {
self.setPeerState((n.leader == self.getId()) ?
ServerState.LEADING: ServerState.FOLLOWING);
leaveInstance();
return new Vote(n.leader, n.zxid);
}
break;
case FOLLOWING:
// Sender reports FOLLOWING: handled exactly like the LEADING case above.
outofelection.put(n.addr, new Vote(n.leader, n.zxid));
if (termPredicate(outofelection, n.leader, n.zxid)) {
self.setPeerState((n.leader == self.getId()) ?
ServerState.LEADING: ServerState.FOLLOWING);
leaveInstance();
return new Vote(n.leader, n.zxid);
}
break;
default:
// Any other notification state is ignored.
break;
}
}
// Peer state stopped being LOOKING without a decision being made in this method.
return null;
} finally {
// Always unregister the JMX bean (if it was registered) and clear the reference.
try {
if(self.jmxLeaderElectionBean != null){
MBeanRegistry.getInstance().unregister(
self.jmxLeaderElectionBean);
}
} catch (Exception e) {
LOG.warn("Failed to unregister with JMX", e);
}
self.jmxLeaderElectionBean = null;
}
}
}
逐个分析。
首先通过 logicalclock++将时钟加一,再通过sendNotifications()将消息发送出去。
然后通过 while (self.getPeerState() == ServerState.LOOKING) 不断循环判断。
case LOOKING:
if (n.epoch > logicalclock) {
logicalclock = n.epoch;
recvset.clear();
if (totalOrderPredicate(n.leader, n.zxid)) {
proposedLeader = n.leader;
proposedZxid = n.zxid;
}
sendNotifications();
} else if (n.epoch < logicalclock) {
break;
} else if (totalOrderPredicate(n.leader, n.zxid)) {
proposedLeader = n.leader;
proposedZxid = n.zxid;
sendNotifications();
}
recvset.put(n.addr, new Vote(n.leader, n.zxid));
// If have received from all nodes, then terminate
if (self.getVotingView().size() == recvset.size()) {
self.setPeerState((proposedLeader == self.getId()) ?
ServerState.LEADING: ServerState.FOLLOWING);
// if (self.state == ServerState.FOLLOWING) {
// Thread.sleep(100);
// }
leaveInstance();
return new Vote(proposedLeader, proposedZxid);
} else if (termPredicate(recvset, proposedLeader,
proposedZxid)) {
// Otherwise, wait for a fixed amount of time
LOG.info("Passed predicate");
Thread.sleep(finalizeWait);
// Notification probe = recvqueue.peek();
// Verify if there is any change in the proposed leader
while ((!recvqueue.isEmpty())
&& !totalOrderPredicate(
recvqueue.peek().leader, recvqueue
.peek().zxid)) {
recvqueue.poll();
}
if (recvqueue.isEmpty()) {
// LOG.warn("Proposed leader: " +
// proposedLeader);
self.setPeerState(
(proposedLeader == self.getId()) ?
ServerState.LEADING :
ServerState.FOLLOWING);
leaveInstance();
return new Vote(proposedLeader, proposedZxid);
}
}
break;
在LOOKING状态下,判断接收到的消息中的时钟(epoch)与自身逻辑时钟的相对大小。
1 如果大于:如果发现收到的选举信息中的时钟大于自己的时钟(说明自己的时钟落后),则将自己的时钟改为消息中的时钟,并清空已收到的选票(这些选票属于旧的轮次,已经过期)。然后通过totalOrderPredicate判断收到的选票是否优于自己当前提议的候选者。
我们查看totalOrderPredicate即可发现原理:
先比较数据zxid,zxid大者胜出;zxid相同时再比较服务器id,id大者胜出。该方法只负责比较,返回true表示收到的选票更优;之后由调用方通过sendNotifications()将自身最新的提议发送出去。
/**
 * Tells whether the given vote outranks the vote this server currently proposes.
 *
 * Votes are ordered by zxid first; when the zxids are equal, the larger server id wins.
 *
 * @param id   server id named by the candidate vote
 * @param zxid zxid carried by the candidate vote
 * @return true if the candidate vote is strictly greater than (proposedZxid, proposedLeader)
 */
private boolean totalOrderPredicate(long id, long zxid) {
    // Different zxids decide the comparison on their own.
    if (zxid != proposedZxid) {
        return zxid > proposedZxid;
    }
    // Equal zxids: fall back to comparing server ids.
    return id > proposedLeader;
}
如果收到的选票更优(totalOrderPredicate返回true),则将候选人更新为收到的选票中的候选者;
否则保持自己当前的提议不变。无论哪种情况,都会把当前的提议发给其他服务器。
2 如果小于: 说明对方的轮次落后,这条消息已经过期,直接忽略(break),不记录该选票。
3 如果等于:若收到的选票优于自己当前的提议(totalOrderPredicate返回true),则将候选者更新为该选票中的候选人,并把新的提议发送给其他服务器;否则保持当前提议不变。