一、选举算法中的概念说明
// ZooKeeper server states.
public enum ServerState {
// No leader is currently known in the ensemble; the server is in the voting (election) state.
LOOKING,
// The server is acting as a follower.
FOLLOWING,
// The server is acting as the leader.
LEADING,
// The server is acting as an observer (observers do not take part in voting).
OBSERVING
}
// "Learner" is the collective name for followers and observers.
public enum LearnerType {
// Participant role (the original note calls this the follower role);
// QuorumMaj registers members of this type as voting members.
PARTICIPANT,
// Observer role.
OBSERVER
}
// org.apache.zookeeper.server.quorum.FastLeaderElection.Notification
// Message broadcast between peers during the voting process.
public static class Notification {
// NOTE(review): presumably the notification format version — not established by this excerpt; confirm.
int version;
// ID of the proposed leader.
long leader;
// zxid of the proposed leader.
long zxid;
// Election round of the sender.
long electionEpoch;
// Server state of the sender (LOOKING while it is still electing).
QuorumPeer.ServerState state;
// ID of the sender.
long sid;
// Per the original note: a QuorumVerifier validates ensemble membership, i.e. it
// decides whether a given set of servers can form a quorum under the configured server list.
QuorumVerifier qv;
// Epoch of the proposed leader.
long peerEpoch;
}
// org.apache.zookeeper.server.quorum.FastLeaderElection.ToSend
/**
* Messages that a peer wants to send to other peers.
* These messages can be both Notifications and Acks
* of reception of notification.
* Used for sending messages.
*/
public static class ToSend {
// Supported message types.
enum mType {
crequest, // request
challenge, // challenge
notification,// notification
ack // acknowledgement of receipt
}
// Copies every argument except 'type' into the matching field; 'type' is accepted but not stored.
ToSend(mType type, long leader, long zxid, long electionEpoch, ServerState state, long sid, long peerEpoch, byte[] configData) {
this.leader = leader;
this.zxid = zxid;
this.electionEpoch = electionEpoch;
this.state = state;
this.sid = sid;
this.peerEpoch = peerEpoch;
this.configData = configData;
}
/*
* Proposed leader in the case of notification
* (ID of the server being voted for as leader)
*/ long leader;
/*
* id contains the tag for acks, and zxid for notifications
*
*/ long zxid;
/*
* Epoch
* (election round)
*/ long electionEpoch;
/*
* Current state;
* (server state)
*/ QuorumPeer.ServerState state;
/*
* Address of recipient
* (server ID of the message recipient)
*/ long sid;
/*
* Used to send a QuorumVerifier (configuration info);
* starts as dummyData and is overwritten by the constructor argument
*/ byte[] configData = dummyData;
/*
* Leader epoch
*/ long peerEpoch;
}
// org.apache.zookeeper.server.quorum.Vote
// Ballot model (fields only; constructors and accessors are omitted from this excerpt).
// NOTE(review): the original notes describe id/zxid as belonging to "the current server",
// but lookForLeader builds votes as new Vote(n.leader, n.zxid, ...) and
// new Vote(proposedLeader, proposedZxid, ...), so these fields most likely describe the
// PROPOSED LEADER — confirm against the Vote constructors.
public class Vote {
// Vote version number, used as an identifier.
private final int version;
// Server ID carried by this vote (see the review note above).
private final long id;
// Transaction ID (zxid) carried by this vote (see the review note above).
private final long zxid;
// Election round in which this vote was cast.
private final long electionEpoch;
// Epoch of the server being proposed.
private final long peerEpoch;
// Server state associated with this vote.
private final ServerState state;
}
二、选举算法过程
/**
 * org.apache.zookeeper.server.quorum.FastLeaderElection#lookForLeader
 * Starts a new round of leader election. Whenever our QuorumPeer
 * changes its state to LOOKING, this method is invoked, and it
 * sends notifications to all other peers.
 *
 * @return the final vote once a leader has been settled on, or null when the
 *         loop ends without one (peer state is no longer LOOKING, or stop is set)
 * @throws InterruptedException presumably raised by the blocking recvqueue calls (poll/put) — confirm
 */
public Vote lookForLeader() throws InterruptedException {
// Part of the original method is omitted in this excerpt.
try {
// Votes received from peers that are in LOOKING state, keyed by sender sid.
Map<Long, Vote> recvset = new HashMap<Long, Vote>();
// Votes received from peers that report FOLLOWING or LEADING state
// (i.e. peers that are already out of the election), keyed by sender sid.
Map<Long, Vote> outofelection = new HashMap<Long, Vote>();
// Current wait time for polling the receive queue; doubled on every timeout below.
int notTimeout = minNotificationInterval;
synchronized (this) {
// Advance the election round.
logicalclock.incrementAndGet();
// Start by proposing ourselves as leader.
updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
}
LOG.info("New election. My id = " + self.getId() + ", proposed zxid=0x" + Long.toHexString(proposedZxid));
// Broadcast the initial (self) vote.
sendNotifications();
SyncedLearnerTracker voteSet;
/*
 * Loop in which we exchange notifications until we find a leader
 */
// Keep going while this server is still LOOKING and has not been asked to stop.
while ((self.getPeerState() == ServerState.LOOKING) && (!stop)) {
/*
 * Remove next notification from queue, times out after 2 times
 * the termination time
 */
// Take one incoming notification; waits up to notTimeout milliseconds.
Notification n = recvqueue.poll(notTimeout, TimeUnit.MILLISECONDS);
/*
 * Sends more notifications if haven't received enough.
 * Otherwise processes new notification.
 */
if (n == null) {
// Nothing arrived within the timeout.
// NOTE(review): haveDelivered() presumably reports whether queued messages were sent — confirm.
if (manager.haveDelivered()) {
// Re-broadcast our current vote.
sendNotifications();
} else {
// Otherwise (re)establish connections to the other servers.
manager.connectAll();
}
/*
 * Exponential backoff:
 * double the queue wait time, capped at maxNotificationInterval
 */
int tmpTimeOut = notTimeout * 2;
notTimeout = (tmpTimeOut < maxNotificationInterval ? tmpTimeOut : maxNotificationInterval);
LOG.info("Notification time out: " + notTimeout);
// Only process the notification when both its sender and its proposed leader pass validVoter().
} else if (validVoter(n.sid) && validVoter(n.leader)) {
/*
 * Only proceed if the vote comes from a replica in the current or next
 * voting view for a replica in the current or next voting view.
 */
switch (n.state) {
// The sender is itself still LOOKING.
case LOOKING:
// If notification > current, replace and send messages out
// The sender is in a newer round: adopt its round and discard the votes collected so far.
if (n.electionEpoch > logicalclock.get()) {
logicalclock.set(n.electionEpoch);
recvset.clear();
// Compare the received vote against our own INITIAL vote; adopt whichever wins.
if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch, getInitId(), getInitLastLoggedZxid(), getPeerEpoch())) {
updateProposal(n.leader, n.zxid, n.peerEpoch);
} else {
updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
}
// Broadcast the (possibly changed) vote.
sendNotifications();
// The sender is in an older round: ignore this notification.
} else if (n.electionEpoch < logicalclock.get()) {
if (LOG.isDebugEnabled()) {
LOG.debug(
"Notification election epoch is smaller than logicalclock. n.electionEpoch = 0x" + Long.toHexString(n.electionEpoch)
+ ", logicalclock=0x" + Long.toHexString(logicalclock.get()));
}
break;
// Same round: compare against our CURRENT proposal and switch if the received vote wins.
} else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch, proposedLeader, proposedZxid, proposedEpoch)) {
updateProposal(n.leader, n.zxid, n.peerEpoch);
// Broadcast the changed vote.
sendNotifications();
}
// don't care about the version if it's in LOOKING state
// Record the sender's vote (replaces any earlier vote from the same sid).
recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
// Tally the received votes against our current proposal.
voteSet = getVoteTracker(recvset, new Vote(proposedLeader, proposedZxid, logicalclock.get(), proposedEpoch));
// Does our current proposal have a quorum?
if (voteSet.hasAllQuorums()) {
// Keep reading notifications for up to finalizeWait ms each; if one beats the
// current proposal, put it back on the queue and leave this inner loop so the
// outer loop processes it.
// Verify if there is any change in the proposed leader
while ((n = recvqueue.poll(finalizeWait, TimeUnit.MILLISECONDS)) != null) {
if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch, proposedLeader, proposedZxid, proposedEpoch)) {
recvqueue.put(n);
break;
}
}
/*
 * This predicate is true once we don't read any new
 * relevant message from the reception queue
 */
// No better vote showed up: the proposed server is accepted as leader.
if (n == null) {
// NOTE(review): setPeerState presumably switches this peer to LEADING or FOLLOWING — confirm.
setPeerState(proposedLeader, voteSet);
Vote endVote = new Vote(proposedLeader, proposedZxid, logicalclock.get(), proposedEpoch);
// NOTE(review): leaveInstance presumably clears pending election state — confirm.
leaveInstance(endVote);
return endVote;
}
}
break;
case OBSERVING:
LOG.debug("Notification from observer: {}", n.sid);
break;
case FOLLOWING:
case LEADING:
/*
 * Consider all notifications from the same epoch
 * together.
 */
// The sender already follows or leads, and its round equals ours.
if (n.electionEpoch == logicalclock.get()) {
// Record the sender's vote alongside the votes from LOOKING peers.
recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
// Tally recvset against the vote carried by this notification.
voteSet = getVoteTracker(recvset, new Vote(n.version, n.leader, n.zxid, n.electionEpoch, n.peerEpoch, n.state));
// Accept n.leader when it has a quorum and checkLeader() confirms it.
// NOTE(review): checkLeader presumably verifies that n.leader itself reported LEADING — confirm.
if (voteSet.hasAllQuorums() && checkLeader(outofelection, n.leader, n.electionEpoch)) {
// Adopt n.leader as the leader and update this peer's state accordingly.
setPeerState(n.leader, voteSet);
Vote endVote = new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch);
// NOTE(review): leaveInstance presumably clears pending election state — confirm.
leaveInstance(endVote);
return endVote;
}
}
/*
 * Before joining an established ensemble, verify that
 * a majority are following the same leader.
 */
// Reached whenever the branch above did not return, whether or not the rounds matched:
// record the sender's vote in outofelection and tally that map instead.
outofelection.put(n.sid, new Vote(n.version, n.leader, n.zxid, n.electionEpoch, n.peerEpoch, n.state));
voteSet = getVoteTracker(outofelection, new Vote(n.version, n.leader, n.zxid, n.electionEpoch, n.peerEpoch, n.state));
// Accept n.leader when the out-of-election votes give it a quorum and checkLeader() confirms it.
if (voteSet.hasAllQuorums() && checkLeader(outofelection, n.leader, n.electionEpoch)) {
synchronized (this) {
// Move our election round to the sender's round.
logicalclock.set(n.electionEpoch);
// Adopt n.leader as the leader and update this peer's state accordingly.
setPeerState(n.leader, voteSet);
}
Vote endVote = new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch);
// NOTE(review): leaveInstance presumably clears pending election state — confirm.
leaveInstance(endVote);
return endVote;
}
break;
default:
LOG.warn("Notification state unrecoginized: " + n.state + " (n.state), " + n.sid + " (n.sid)");
break;
}
} else {
// The notification failed validation; log which side (proposed leader and/or sender) was rejected.
if (!validVoter(n.leader)) {
LOG.warn("Ignoring notification for non-cluster member sid {} from sid {}", n.leader, n.sid);
}
if (!validVoter(n.sid)) {
LOG.warn("Ignoring notification for sid {} from non-quorum member sid {}", n.leader, n.sid);
}
}
}
// The loop ended without settling on a leader.
return null;
} finally {
// Always unregister the election JMX bean (if any) and drop the reference.
try {
if (self.jmxLeaderElectionBean != null) {
MBeanRegistry.getInstance().unregister(self.jmxLeaderElectionBean);
}
} catch (Exception e) {
LOG.warn("Failed to unregister with JMX", e);
}
self.jmxLeaderElectionBean = null;
LOG.debug("Number of connection processing threads: {}", manager.getConnectionThreadCount());
}
}
/**
 * org.apache.zookeeper.server.quorum.FastLeaderElection#sendNotifications
 * Queues one notification carrying our current proposal for every voter
 * of the current and next configuration.
 */
private void sendNotifications() {
    for (long recipient : self.getCurrentAndNextConfigVoters()) {
        // The verifier is looked up per recipient, exactly as before.
        QuorumVerifier verifier = self.getQuorumVerifier();
        ToSend outgoing = new ToSend(
                ToSend.mType.notification,
                proposedLeader,
                proposedZxid,
                logicalclock.get(),
                QuorumPeer.ServerState.LOOKING,
                recipient,
                proposedEpoch,
                verifier.toString().getBytes());

        if (LOG.isDebugEnabled()) {
            StringBuilder text = new StringBuilder("Sending Notification: ");
            text.append(proposedLeader).append(" (n.leader), 0x");
            text.append(Long.toHexString(proposedZxid)).append(" (n.zxid), 0x");
            text.append(Long.toHexString(logicalclock.get())).append(" (n.round), ");
            text.append(recipient).append(" (recipient), ");
            text.append(self.getId()).append(" (myid), 0x");
            text.append(Long.toHexString(proposedEpoch)).append(" (n.peerEpoch)");
            LOG.debug(text.toString());
        }

        // Hand the message over to the send queue.
        sendqueue.offer(outgoing);
    }
}
// org.apache.zookeeper.server.quorum.QuorumPeer#getCurrentAndNextConfigVoters
/**
 * Collects the ids of all voting members of the current quorum verifier and,
 * when a last-seen quorum verifier exists, of that one as well.
 *
 * @return a fresh set holding the union of both voter id sets
 */
public synchronized Set<Long> getCurrentAndNextConfigVoters() {
    Set<Long> union = new HashSet<Long>();
    // Voters of the current configuration.
    union.addAll(getQuorumVerifier().getVotingMembers().keySet());
    // Voters of the last seen configuration, when there is one.
    if (getLastSeenQuorumVerifier() != null) {
        union.addAll(getLastSeenQuorumVerifier().getVotingMembers().keySet());
    }
    return union;
}
//org.apache.zookeeper.server.quorum.FastLeaderElection#totalOrderPredicate(选票PK重要函数)
/**
 * Vote comparison ("PK"): decides whether a received vote
 * (newId, newZxid, newEpoch) beats the vote currently held
 * (curId, curZxid, curEpoch).
 *
 * A candidate whose weight in the quorum verifier is 0 never wins.
 * Otherwise the votes are ordered by epoch, then by zxid, then by
 * server id; the received vote wins only when it is strictly greater.
 *
 * @return true when the received vote should replace the current one
 */
protected boolean totalOrderPredicate(long newId, long newZxid, long newEpoch, long curId, long curZxid, long curEpoch) {
    if (LOG.isDebugEnabled()) {
        String hexNew = Long.toHexString(newZxid);
        String hexCur = Long.toHexString(curZxid);
        LOG.debug("id: " + newId + ", proposed id: " + curId + ", zxid: 0x" + hexNew + ", proposed zxid: 0x" + hexCur);
    }

    // Zero-weight candidates are rejected outright.
    if (self.getQuorumVerifier().getWeight(newId) == 0) {
        return false;
    }

    // 1- the epoch decides whenever the epochs differ
    if (newEpoch != curEpoch) {
        return newEpoch > curEpoch;
    }
    // 2- same epoch: the zxid decides whenever the zxids differ
    if (newZxid != curZxid) {
        return newZxid > curZxid;
    }
    // 3- same epoch and zxid: the higher server id wins
    return newId > curId;
}
//org.apache.zookeeper.server.quorum.SyncedLearnerTracker#hasAllQuorums(投票过半的逻辑判断函数)
/**
 * Quorum check: true only when every tracked verifier reports that
 * its own ack set contains a quorum.
 */
public boolean hasAllQuorums() {
    boolean everyVerifierSatisfied = true;
    for (QuorumVerifierAcksetPair pair : qvAcksetPairs) {
        // The actual quorum rule lives in the verifier's containsQuorum().
        boolean reached = pair.getQuorumVerifier().containsQuorum(pair.getAckset());
        if (!reached) {
            everyVerifierSatisfied = false;
            break;
        }
    }
    return everyVerifierSatisfied;
}
// org.apache.zookeeper.server.quorum.flexible.QuorumMaj
// Majority threshold, computed once in the constructor.
private int half;

/**
 * Splits the members into voting and observing members and defines
 * a majority to avoid computing it every time.
 *
 * @param allMembers every configured server, keyed by server id
 */
public QuorumMaj(Map<Long, QuorumServer> allMembers) {
    this.allMembers = allMembers;
    for (QuorumServer member : allMembers.values()) {
        Long memberId = Long.valueOf(member.id);
        // Anything that is not a PARTICIPANT is kept as an observer and is
        // left out of the majority computation.
        if (member.type != LearnerType.PARTICIPANT) {
            observingMembers.put(memberId, member);
            continue;
        }
        votingMembers.put(memberId, member);
    }
    int voterCount = votingMembers.size();
    half = voterCount / 2;
}
/**
 * Tells whether the given acks form a majority, i.e. whether there are
 * strictly more acks than half of the voting members. Assumes that
 * ackSet contains acks only from votingMembers.
 *
 * @param ackSet ids of the servers that acknowledged
 * @return true when ackSet is larger than the precomputed half
 */
public boolean containsQuorum(Set<Long> ackSet) {
    final int ackCount = ackSet.size();
    return half < ackCount;
}
三、不同场景下的选举过程图示
自增选举轮次
ZooKeeper规定所有有效的投票都必须在同一轮次中。每个服务器在开始新一轮投票时,会先对自己维护的logicClock进行自增操作。
初始化选票
每个服务器在广播自己的选票前,会将自己的投票箱清空。该投票箱记录了所收到的选票。例:服务器2投票给服务器3,服务器3投票给服务器1,则服务器1的投票箱为(2, 3), (3, 1), (1, 1)。票箱中只会记录每一投票者的最后一票,如投票者更新自己的选票,则其它服务器收到该新选票后会在自己票箱中更新该服务器的选票。
发送初始化选票
每个服务器最开始都是通过广播把票投给自己。
接收外部投票
服务器会尝试从其它服务器获取投票,并记入自己的投票箱内。如果无法获取任何外部投票,则会确认自己是否与集群中其它服务器保持着有效连接。如果是,则再次发送自己的投票;如果否,则马上与之建立连接。
判断选举轮次
收到外部投票后,首先会根据投票信息中所包含的logicClock来进行不同处理:
外部投票的logicClock大于自己的logicClock。说明该服务器的选举轮次落后于其它服务器的选举轮次,立即清空自己的投票箱并将自己的logicClock更新为收到的logicClock,然后再对比自己之前的投票与收到的投票以确定是否需要变更自己的投票,最终再次将自己的投票广播出去。
外部投票的logicClock小于自己的logicClock。当前服务器直接忽略该投票,继续处理下一个投票。
外部投票的logicClock与自己的相等。此时进行选票PK。
选票PK
选票PK是基于(self_id, self_zxid)与(vote_id, vote_zxid)的对比(对应上文的totalOrderPredicate;在实际代码中会先比较被推举者的peerEpoch,peerEpoch相同时再比较zxid,zxid也相同时最后比较服务器ID):
外部投票的logicClock大于自己的logicClock,则将自己的logicClock及自己的选票的logicClock变更为收到的logicClock
若logicClock一致,则对比二者的vote_zxid,若外部投票的vote_zxid比较大,则将自己的票中的vote_zxid与vote_myid更新为收到的票中的vote_zxid与vote_myid并广播出去,另外将收到的票及自己更新后的票放入自己的票箱。如果票箱内已存在(self_myid, self_zxid)相同的选票,则直接覆盖
若二者vote_zxid一致,则比较二者的vote_myid,若外部投票的vote_myid比较大,则将自己的票中的vote_myid更新为收到的票中的vote_myid并广播出去,另外将收到的票及自己更新后的票放入自己的票箱
统计选票
如果已经确定有过半服务器认可了自己的投票(可能是更新后的投票),则终止投票。否则继续接收其它服务器的投票。
更新服务器状态
投票终止后,服务器开始更新自身状态。若过半的票投给了自己,则将自己的服务器状态更新为LEADING,否则将自己的状态更新为FOLLOWING。
参考:
https://blog.csdn.net/qq_41724691/article/details/84619465