选举中的角色和状态
- LOOKING:投票状态
- FOLLOWING: 跟随主的状态
- LEADING: 主的状态
- OBSERVING: 观察状态。observer没有权利竞选和投票,只能观察
投票过程中的概念
- 外部投票: Notification,其他服务器发过来的投票
- 内部投票: Proposal,每台服务器自己当前的投票
- PK: totalOrderPredicate()方法,对内部投票和外部投票进行比对,比对逻辑是:
- 首先比较被推举leader的epoch(即peerEpoch),选择大的;投票轮次(electionEpoch)不在该方法内比较,而是在lookForLeader()中调用它之前单独比较
- 如果epoch相同,则比较事务id,也就是zxid
- zxid也相同,则比较服务器id,也就是myid
选票中的信息
-
leader:选出的leader的id
-
zxid:选出的leader的zxid(事务id)
-
electionEpoch:选举轮次,每发起一轮新的选举(进入lookForLeader())时自增一次,对应logicalclock
-
QuorumPeer.ServerState:服务器状态,LOOKING,FOLLOWING,LEADING,OBSERVING
-
sid:发出这个投票的服务器id,Myid
-
peerEpoch: 选出的leader的Epoch,Epoch是选举的次数
投票过程
发生选举的两种情况
-
zk集群初始化启动的时候。
简单描述:假如有A,B,C三台服务器,他们的myid分别是1,2,3。依次启动A,B,C。
1.当A启动后,A是LOOKING状态并且先投自己,并把自己的投票广播给其他服务器。
2.当B启动后,B也是LOOKING状态并且先投自己,广播自己的投票,然后从recvqueue中拿到A的票进行比对。A,B的epoch都是1,zxid相同,B的id比A大,所以B胜出。在B这台机器上,AB都投了B,2票超过了半数(3/2=1.5),B成为leader,状态改为LEADING。
3.A从recvqueue取出B的投票信息进行对比,对比逻辑同上,B胜出。在A这里,AB都选B,B是leader。A成为follower,状态改为FOLLOWING。
4.C一启动,状态是LOOKING,C先投了自己一票,然后广播了自己的投票,当C收到其他服务器返回的投票时,发现其他服务器的状态是LEADING/FOLLOWING。C发现已经有leader了,C的状态改成FOLLOWING。
-
zk集群运行中,leader不可用时。
简单描述:假如有A,B,C,D四台服务器,他们的epoch,zxid,myid分别是(1,7,1), (1,8,2), (1,7,3), (1,8,4),D是leader。D在运行中挂掉了。
1.A,B,C的状态从FOLLOWING变为LOOKING。
2.A,B,C首先先投自己一票,然后把自己的投票广播给其他服务器。
3.A,B,C互相收到了对方发来的投票,经过对比(三者epoch相同,B的zxid为8,大于A和C的7),最终ABC分别都确定了B为leader,然后B的状态改为LEADING,AC分别都改成FOLLOWING。
ZooKeeper 3.6.1选举过程源码
省略了日志和相对不重要的代码
/**
* Starts a new round of leader election. Whenever our QuorumPeer
* changes its state to LOOKING, this method is invoked, and it
* sends notifications to all other peers.
*
* <p>This listing is abridged: logging and some secondary code were left out, which is why
* a few branches and the {@code finally} block below are empty.
*
* @return the vote this peer settled on once a leader has been determined, or {@code null}
*         when the loop ends (peer no longer LOOKING, or {@code stop} is set) without one
* @throws InterruptedException declared by this method; presumably raised by the blocking
*         operations on {@code recvqueue} (its type is not visible here -- confirm)
*/
public Vote lookForLeader() throws InterruptedException {
try {
/*
* The votes from the current leader election are stored in recvset. In other words, a vote v is in recvset
* if v.electionEpoch == logicalclock. The current participant uses recvset to deduce on whether a majority
* of participants has voted for it.
*/
Map<Long, Vote> recvset = new HashMap<Long, Vote>();
/*
* The votes from previous leader elections, as well as the votes from the current leader election are
* stored in outofelection. Note that notifications in a LOOKING state are not stored in outofelection.
* Only FOLLOWING or LEADING notifications are stored in outofelection. The current participant could use
* outofelection to learn which participant is the leader if it arrives late (i.e., higher logicalclock than
* the electionEpoch of the received notifications) in a leader election.
*/
Map<Long, Vote> outofelection = new HashMap<Long, Vote>();
// Poll timeout in milliseconds; starts at the minimum and is doubled after every empty poll.
int notTimeout = minNotificationInterval;
// Open a new round: advance the logical clock and reset our proposal to the initial
// values (presumably a vote for ourselves -- the getInit* helpers are not shown here).
synchronized (this) {
logicalclock.incrementAndGet();
updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
}
// Announce the initial proposal.
sendNotifications();
SyncedLearnerTracker voteSet;
/*
* Loop in which we exchange notifications until we find a leader
*/
while ((self.getPeerState() == ServerState.LOOKING) && (!stop)) {
/*
* Take the next notification from the queue, waiting at most
* notTimeout milliseconds; null means nothing arrived in time.
*/
Notification n = recvqueue.poll(notTimeout, TimeUnit.MILLISECONDS);
/*
* Sends more notifications if haven't received enough.
* Otherwise processes new notification.
*/
if (n == null) {
// Nothing received: resend our vote if the manager reports delivery,
// otherwise ask it to (re)connect to all peers.
if (manager.haveDelivered()) {
sendNotifications();
} else {
manager.connectAll();
}
/*
* Exponential backoff: double the wait, capped at maxNotificationInterval
*/
int tmpTimeOut = notTimeout * 2;
notTimeout = Math.min(tmpTimeOut, maxNotificationInterval);
LOG.info("Notification time out: {}", notTimeout);
} else if (validVoter(n.sid) && validVoter(n.leader)) {
/*
* Only proceed if the vote comes from a replica in the current or next
* voting view for a replica in the current or next voting view.
*/
switch (n.state) {
case LOOKING:
// Skip the notification when either our own initial zxid or the sender's zxid is -1.
if (getInitLastLoggedZxid() == -1) {
break;
}
if (n.zxid == -1) {
break;
}
// If notification > current, replace and send messages out
if (n.electionEpoch > logicalclock.get()) {
// Sender is in a newer round: adopt that round and discard the votes
// collected so far, since they belong to the older round.
logicalclock.set(n.electionEpoch);
recvset.clear();
// Compare the received vote against our INITIAL vote (not the current proposal).
if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch, getInitId(), getInitLastLoggedZxid(), getPeerEpoch())) {
updateProposal(n.leader, n.zxid, n.peerEpoch);
} else {
updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
}
sendNotifications();
} else if (n.electionEpoch < logicalclock.get()) {
// Sender is in an older round: ignore the notification entirely.
break;
} else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch, proposedLeader, proposedZxid, proposedEpoch)) {
// Same round and the received vote beats our current proposal: adopt and rebroadcast it.
updateProposal(n.leader, n.zxid, n.peerEpoch);
sendNotifications();
}
// don't care about the version if it's in LOOKING state
// Record the sender's vote, then check whether our current proposal has quorum support.
recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
voteSet = getVoteTracker(recvset, new Vote(proposedLeader, proposedZxid, logicalclock.get(), proposedEpoch));
if (voteSet.hasAllQuorums()) {
// Verify if there is any change in the proposed leader
// Keep draining the queue (waiting up to finalizeWait ms each time); if a vote that
// beats our proposal shows up, push it back and return to the main loop to process it.
while ((n = recvqueue.poll(finalizeWait, TimeUnit.MILLISECONDS)) != null) {
if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch, proposedLeader, proposedZxid, proposedEpoch)) {
recvqueue.put(n);
break;
}
}
/*
* This predicate is true once we don't read any new
* relevant message from the reception queue
*/
if (n == null) {
setPeerState(proposedLeader, voteSet);
Vote endVote = new Vote(proposedLeader, proposedZxid, logicalclock.get(), proposedEpoch);
leaveInstance(endVote);
return endVote;
}
}
break;
case OBSERVING:
// Notifications from observers are not processed.
break;
case FOLLOWING:
case LEADING:
/*
* Consider all notifications from the same epoch
* together.
*/
if (n.electionEpoch == logicalclock.get()) {
recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch, n.state));
voteSet = getVoteTracker(recvset, new Vote(n.version, n.leader, n.zxid, n.electionEpoch, n.peerEpoch, n.state));
// Finish only if the sender's vote has quorum support in recvset and checkLeader accepts the leader.
if (voteSet.hasAllQuorums() && checkLeader(recvset, n.leader, n.electionEpoch)) {
setPeerState(n.leader, voteSet);
Vote endVote = new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch);
leaveInstance(endVote);
return endVote;
}
}
/*
* Before joining an established ensemble, verify that
* a majority are following the same leader.
*
* Note that the outofelection map also stores votes from the current leader election.
* See ZOOKEEPER-1732 for more information.
*/
outofelection.put(n.sid, new Vote(n.version, n.leader, n.zxid, n.electionEpoch, n.peerEpoch, n.state));
voteSet = getVoteTracker(outofelection, new Vote(n.version, n.leader, n.zxid, n.electionEpoch, n.peerEpoch, n.state));
if (voteSet.hasAllQuorums() && checkLeader(outofelection, n.leader, n.electionEpoch)) {
// Joining an existing ensemble: align our logical clock with the sender's round
// and switch state under the same lock.
synchronized (this) {
logicalclock.set(n.electionEpoch);
setPeerState(n.leader, voteSet);
}
Vote endVote = new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch);
leaveInstance(endVote);
return endVote;
}
break;
default:
break;
}
} else {
// Invalid proposed leader and/or invalid sender: the notification is dropped.
// Both bodies are empty here because the logging was omitted from this listing.
if (!validVoter(n.leader)) {
}
if (!validVoter(n.sid)) {
}
}
}
// Loop ended without settling on a leader (state changed away from LOOKING, or stop was set).
return null;
} finally {
// Intentionally empty in this abridged listing; NOTE(review): the full source presumably
// performs cleanup here -- confirm against the original file.
}
}
/**
 * A notification is the message a peer sends to let the others know
 * that it has changed its vote. That happens either because the peer
 * has just joined leader election, or because it learned of another
 * peer with a higher zxid, or with the same zxid and a higher server id.
 */
public static class Notification {

    /** Format version, introduced in 3.4.6. */
    public static final int CURRENTVERSION = 0x2;

    /** Format version carried by this particular notification. */
    int version;

    /** Proposed leader. */
    long leader;

    /** zxid of the proposed leader. */
    long zxid;

    /** Election epoch of this notification. */
    long electionEpoch;

    /** Current state of the sender. */
    QuorumPeer.ServerState state;

    /** Address (server id) of the sender. */
    long sid;

    QuorumVerifier qv;

    /** Epoch of the proposed leader. */
    long peerEpoch;
}
/**
 * Decides whether a candidate vote beats the vote it is compared against.
 *
 * <p>The candidate loses outright when the quorum verifier assigns its server id a
 * weight of zero. Otherwise the two votes are ordered field by field, and the first
 * field that differs decides: epoch first, then zxid, then server id. Identical votes
 * do not count as a win for the candidate.
 *
 * @param newId    server id proposed by the candidate vote
 * @param newZxid  zxid of the candidate vote
 * @param newEpoch epoch of the candidate vote
 * @param curId    server id of the vote being compared against
 * @param curZxid  zxid of the vote being compared against
 * @param curEpoch epoch of the vote being compared against
 * @return {@code true} if the candidate vote strictly wins, {@code false} otherwise
 */
protected boolean totalOrderPredicate(long newId, long newZxid, long newEpoch, long curId, long curZxid, long curEpoch) {
    LOG.debug(
        "id: {}, proposed id: {}, zxid: 0x{}, proposed zxid: 0x{}",
        newId,
        curId,
        Long.toHexString(newZxid),
        Long.toHexString(curZxid));

    // A candidate with zero weight can never win.
    final boolean candidateHasNoWeight = self.getQuorumVerifier().getWeight(newId) == 0;
    if (candidateHasNoWeight) {
        return false;
    }

    // 1- Different epochs: the higher epoch wins.
    if (newEpoch != curEpoch) {
        return newEpoch > curEpoch;
    }

    // 2- Same epoch, different zxids: the higher zxid wins.
    if (newZxid != curZxid) {
        return newZxid > curZxid;
    }

    // 3- Same epoch and zxid: the higher server id wins (a tie is not a win).
    return newId > curId;
}