源码中关于Time.currentElapsedTime()的内部解析部分可以参考博客另一篇文章
ZKcurrentElapsedTime
/**
* Starts a new round of leader election. Whenever our QuorumPeer
* changes its state to LOOKING, this method is invoked, and it
* sends notifications to all other peers.
*/
public Vote lookForLeader() throws InterruptedException {
try {
self.jmxLeaderElectionBean = new LeaderElectionBean();
MBeanRegistry.getInstance().register(
self.jmxLeaderElectionBean, self.jmxLocalPeerBean);
} catch (Exception e) {
LOG.warn("Failed to register with JMX", e);
self.jmxLeaderElectionBean = null;
}
//开始选举的时间,如果为0则调用currentElapsedTime得到某个时间
if (self.start_fle == 0) {
self.start_fle = Time.currentElapsedTime();
}
try {
//存放接收到的选举票
HashMap<Long, Vote> recvset = new HashMap<Long, Vote>();
//存放投票结果
HashMap<Long, Vote> outofelection = new HashMap<Long, Vote>();
//系统超市时间,默认是200毫微秒
int notTimeout = finalizeWait;
synchronized(this){
//逻辑始终自动加一,属于原子型操作
logicalclock.incrementAndGet();
//更新此次选举协议
updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
}
LOG.info("New election. My id = " + self.getId() +
", proposed zxid=0x" + Long.toHexString(proposedZxid));
//发给所有的服务器节点
sendNotifications();
/*
* Loop in which we exchange notifications until we find a leader
*/
// 如果状态为looking,进入循环,知道选举到leader,除非中途所有节点挂掉
while ((self.getPeerState() == ServerState.LOOKING) &&
(!stop)){
/*
* Remove next notification from queue, times out after 2 times
* the termination time
*/
//从队列中获取获取投票信息,如果二次未取到,则超时关闭
Notification n = recvqueue.poll(notTimeout,
TimeUnit.MILLISECONDS);
/*
* Sends more notifications if haven't received enough.
* Otherwise processes new notification.
*/
//如果从队列中没有获取到票,那么则通知所有节点再次投票
if(n == null){
if(manager.haveDelivered()){
sendNotifications();
} else {
manager.connectAll();
}
/*
* Exponential backoff
*/
//更改超时时间,当规定的超时时间的2倍大于最大通知间隔时间后,则会把最大通知时间间隔赋给超时时间
//主要是增加从队列提取票的成功率
int tmpTimeOut = notTimeout*2;
notTimeout = (tmpTimeOut < maxNotificationInterval?
tmpTimeOut : maxNotificationInterval);
LOG.info("Notification time out: " + notTimeout);
}
//如果队列当中有投票信息,则检查票是否合法
else if(validVoter(n.sid) && validVoter(n.leader)) {
/*
* Only proceed if the vote comes from a replica in the
* voting view for a replica in the voting view.
*/
switch (n.state) {
case LOOKING:
// If notification > current, replace and send messages out
//判断收到的epoch是不是比上一次选举的大,
if (n.electionEpoch > logicalclock.get()) {
//如果大,则epoch将被设置成最大的eopoch
logicalclock.set(n.electionEpoch);
//清空队列
recvset.clear();
/*
totalOrderPredicate:
* We return true if one of the following three cases hold:
* 1- New epoch is higher
* 2- New epoch is the same as current epoch, but new zxid is higher
* 3- New epoch is the same as current epoch, new zxid is the same
* as current zxid, but server id is higher.
选举规则:
1.收到epoch大于当前的epoch,则直接胜出
2.epoch等于当前的,但是zxid是最大的,胜出
3,如果epoch,zxid都相同,则比较sid,sid最大胜出
return ((newEpoch > curEpoch) ||
((newEpoch == curEpoch) &&
((newZxid > curZxid) || ((newZxid == curZxid) && (newId > curId)))));
*/
if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
getInitId(), getInitLastLoggedZxid(), getPeerEpoch())) {
updateProposal(n.leader, n.zxid, n.peerEpoch);
} else {//如果没有选出leader,则把此次选举情况进行更新,并发送给所有节点
updateProposal(getInitId(),
getInitLastLoggedZxid(),
getPeerEpoch());
}
sendNotifications();
//如果收到的epoch小于上一次的,那么报出异常,此次选举无效,终止选举
} else if (n.electionEpoch < logicalclock.get()) {
if(LOG.isDebugEnabled()){
LOG.debug("Notification election epoch is smaller than logicalclock. n.electionEpoch = 0x"
+ Long.toHexString(n.electionEpoch)
+ ", logicalclock=0x" + Long.toHexString(logicalclock.get()));
}
break;
//收到的epoch和之前的相等,则再次比较myid,sid,进行选举
} else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
proposedLeader, proposedZxid, proposedEpoch)) {
updateProposal(n.leader, n.zxid, n.peerEpoch);
sendNotifications();
}
if(LOG.isDebugEnabled()){
LOG.debug("Adding vote: from=" + n.sid +
", proposed leader=" + n.leader +
", proposed zxid=0x" + Long.toHexString(n.zxid) +
", proposed election epoch=0x" + Long.toHexString(n.electionEpoch));
}
//此次选举有效的话,添加此次选举协议到队列中
recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
//判断选举是否结束,默认是一半以上
if (termPredicate(recvset,
new Vote(proposedLeader, proposedZxid,
logicalclock.get(), proposedEpoch))) {
// Verify if there is any change in the proposed leader
while((n = recvqueue.poll(finalizeWait,
TimeUnit.MILLISECONDS)) != null){
/ if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
proposedLeader, proposedZxid, proposedEpoch)){
recvqueue.put(n);/
break;
}
}
/*
* This predicate is true once we don't read any new
* relevant message from the reception queue
*/
//如果没有读取到如何票,直接返回当前存在的票信息
if (n == null) {
self.setPeerState((proposedLeader == self.getId()) ?
ServerState.LEADING: learningState());
Vote endVote = new Vote(proposedLeader,
proposedZxid,
logicalclock.get(),
proposedEpoch);
leaveInstance(endVote);
return endVote;
}
}
break;
case OBSERVING:
LOG.debug("Notification from observer: " + n.sid);
break;
case FOLLOWING:
case LEADING:
/*
* Consider all notifications from the same epoch
* together.
*/
if(n.electionEpoch == logicalclock.get()){
recvset.put(n.sid, new Vote(n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch));
if(ooePredicate(recvset, outofelection, n)) {
self.setPeerState((n.leader == self.getId()) ?
ServerState.LEADING: learningState());
Vote endVote = new Vote(n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch);
leaveInstance(endVote);
return endVote;
}
}
/*
* Before joining an established ensemble, verify
* a majority is following the same leader.
*/
outofelection.put(n.sid, new Vote(n.version,
n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch,
n.state));
if(ooePredicate(outofelection, outofelection, n)) {
synchronized(this){
logicalclock.set(n.electionEpoch);
self.setPeerState((n.leader == self.getId()) ?
ServerState.LEADING: learningState());
}
Vote endVote = new Vote(n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch);
leaveInstance(endVote);
return endVote;
}
break;
default:
LOG.warn("Notification state unrecognized: {} (n.state), {} (n.sid)",
n.state, n.sid);
break;
}
} else {
if (!validVoter(n.leader)) {
LOG.warn("Ignoring notification for non-cluster member sid {} from sid {}", n.leader, n.sid);
}
if (!validVoter(n.sid)) {
LOG.warn("Ignoring notification for sid {} from non-quorum member sid {}", n.leader, n.sid);
}
}
}
return null;
} finally {
try {
if(self.jmxLeaderElectionBean != null){
MBeanRegistry.getInstance().unregister(
self.jmxLeaderElectionBean);
}
} catch (Exception e) {
LOG.warn("Failed to unregister with JMX", e);
}
self.jmxLeaderElectionBean = null;
LOG.debug("Number of connection processing threads: {}",
manager.getConnectionThreadCount());
}
}
在源码808行
updateProposal方法主要是更新投票信息
// Under this object's lock: advance the logical clock, then reset the
// proposal to the values returned by getInitId(), getInitLastLoggedZxid()
// and getPeerEpoch().
synchronized(this){
logicalclock.incrementAndGet();
updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
}
updateProposal方法:
/**
 * Replaces the current proposal (leader, zxid, epoch) with the given values.
 * When debug logging is on, the new and old leader/zxid are logged before
 * the fields are overwritten.
 *
 * @param leader id of the server being proposed as leader
 * @param zxid   zxid to record for the proposal
 * @param epoch  epoch to record for the proposal
 */
synchronized void updateProposal(long leader, long zxid, long epoch) {
    if (LOG.isDebugEnabled()) {
        // Build the message in two halves: incoming values first, then the ones being replaced.
        final String incoming = leader + " (newleader), 0x"
                + Long.toHexString(zxid) + " (newzxid), ";
        final String replaced = proposedLeader + " (oldleader), 0x"
                + Long.toHexString(proposedZxid) + " (oldzxid)";
        LOG.debug("Updating proposal: " + incoming + replaced);
    }

    proposedLeader = leader;
    proposedZxid = zxid;
    proposedEpoch = epoch;
}
updateProposal方法传入的三个参数的方法如下:
三个方法的作用分别是:得到myid、得到最新的zxid、得到当前的epoch(即self.getCurrentEpoch(),注意不是逻辑时钟logicalclock);前提是本节点为参与者(PARTICIPANT),否则返回Long.MIN_VALUE
/**
 * Returns the server id this peer uses in its initial vote.
 *
 * @return this peer's id when it is a PARTICIPANT, otherwise
 *         {@code Long.MIN_VALUE}
 */
private long getInitId() {
    return (self.getLearnerType() == LearnerType.PARTICIPANT)
            ? self.getId()
            : Long.MIN_VALUE;
}
/**
 * Returns the zxid this peer uses in its initial vote.
 *
 * @return this peer's last logged zxid when it is a PARTICIPANT, otherwise
 *         {@code Long.MIN_VALUE}
 */
private long getInitLastLoggedZxid() {
    return (self.getLearnerType() == LearnerType.PARTICIPANT)
            ? self.getLastLoggedZxid()
            : Long.MIN_VALUE;
}
/**
 * Returns the epoch this peer uses in its initial vote.
 *
 * @return this peer's current epoch when it is a PARTICIPANT, otherwise
 *         {@code Long.MIN_VALUE}
 * @throws RuntimeException if reading the current epoch fails with an
 *         {@link IOException}; the original exception is attached as the cause
 */
private long getPeerEpoch() {
    if (self.getLearnerType() != LearnerType.PARTICIPANT) {
        return Long.MIN_VALUE;
    }
    try {
        return self.getCurrentEpoch();
    } catch (IOException e) {
        // Keep the same message and stack trace as before, but also chain the
        // IOException so its type is not lost to callers and log output.
        RuntimeException re = new RuntimeException(e.getMessage(), e);
        re.setStackTrace(e.getStackTrace());
        throw re;
    }
}
源码572行,sendNotifications
通过整个投票视图,得到所有的节点id,把投票信息封装给一个ToSend并添加到sendqueue队列,之后在某些时候ZK将发送给所有节点
我们可以看到投票信息发给所有节点的过程,是把投票信息封装给一个消息,之后将消息放到队列当中,zk择机会发送给所有节点
// Queue of outgoing ToSend messages; sendNotifications() adds to it with offer().
LinkedBlockingQueue<ToSend> sendqueue;
/**
 * Queues one notification carrying the current proposal for every server in
 * the voting view. Messages are only placed on {@code sendqueue} here.
 */
private void sendNotifications() {
    for (QuorumServer peer : self.getVotingView().values()) {
        final long sid = peer.id;

        // Every notification built here advertises the LOOKING state.
        final ToSend outgoing = new ToSend(
                ToSend.mType.notification,
                proposedLeader,
                proposedZxid,
                logicalclock.get(),
                QuorumPeer.ServerState.LOOKING,
                sid,
                proposedEpoch);

        if (LOG.isDebugEnabled()) {
            LOG.debug("Sending Notification: " + proposedLeader + " (n.leader), 0x"
                    + Long.toHexString(proposedZxid) + " (n.zxid), 0x"
                    + Long.toHexString(logicalclock.get()) + " (n.round), "
                    + sid + " (recipient), "
                    + self.getId() + " (myid), 0x"
                    + Long.toHexString(proposedEpoch) + " (n.peerEpoch)");
        }

        sendqueue.offer(outgoing);
    }
}
在827行有此方法
主要作用是从recvqueue队列取出投票信息,交给Notification
注意这个队列的取出方法带有超时时间,单位是毫秒(TimeUnit.MILLISECONDS);超时仍未取到时返回null,随后超时时间会翻倍(上限为maxNotificationInterval),而不是"取两次就失败"
// Wait up to notTimeout milliseconds for the next notification from recvqueue;
// the caller treats a null result as "nothing arrived in time".
Notification n = recvqueue.poll(notTimeout,
TimeUnit.MILLISECONDS);
这个是Notification静态内部类,用来封装接收到的投票通知,
作用是让其他节点知道某个节点的投票已经改变。
节点改变投票的原因有三种:
刚加入到leader选举;
得知有其他节点的zxid更大;
zxid相同但对方的sid更大
/**
 * Plain data holder for a single election notification. All fields are
 * package-visible and mutable; {@link #toString()} renders them for logging.
 */
public static class Notification {
    /** Format version, introduced in 3.4.6. */
    public static final int CURRENTVERSION = 0x1;

    /** Message format version carried by this notification. */
    int version;

    /** Proposed leader. */
    long leader;

    /** zxid of the proposed leader. */
    long zxid;

    /** Election epoch (rendered as "n.round" in {@link #toString()}). */
    long electionEpoch;

    /** Current state of the sender. */
    QuorumPeer.ServerState state;

    /** Id of the sending server. */
    long sid;

    /** Epoch of the proposed leader. */
    long peerEpoch;

    /**
     * Renders every field with a short label; version, zxid, electionEpoch
     * and peerEpoch are printed in hexadecimal.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append(Long.toHexString(version)).append(" (message format version), ");
        text.append(leader).append(" (n.leader), 0x");
        text.append(Long.toHexString(zxid)).append(" (n.zxid), 0x");
        text.append(Long.toHexString(electionEpoch)).append(" (n.round), ");
        text.append(state).append(" (n.state), ");
        text.append(sid).append(" (n.sid), 0x");
        text.append(Long.toHexString(peerEpoch)).append(" (n.peerEpoch) ");
        return text.toString();
    }
}
在源码891行有termPredicate方法
传入的参数是该接收到的投票信息,以及一个Vote对象
主要是遍历已经收到的投票结果是否有等于当前投票提议的。如果有把当前投票的节点sid放入到set集合中
最后判断是否过半
/**
 * Checks whether the given vote is backed by a quorum of the received votes.
 *
 * @param votes received votes, keyed by the id of the server that sent them
 * @param vote  the vote to look for among {@code votes}
 * @return {@code true} if the servers whose vote equals {@code vote} form a
 *         quorum according to this peer's quorum verifier
 */
protected boolean termPredicate(
        HashMap<Long, Vote> votes,
        Vote vote) {
    // Ids of the servers whose recorded vote equals the candidate vote.
    HashSet<Long> supporters = new HashSet<Long>();

    for (Map.Entry<Long, Vote> ballot : votes.entrySet()) {
        if (!vote.equals(ballot.getValue())) {
            continue;
        }
        supporters.add(ballot.getKey());
    }

    // Delegate the "is this enough?" decision to the configured verifier.
    return self.getQuorumVerifier().containsQuorum(supporters);
}
判断是否过半的containsQuorum是接口QuorumVerifier的一个方法
QuorumVerifier接口有两个实现类,我们看一下QuorumMaj类
这段代码就不用解释了,太简单了
/**
 * Simple-majority quorum verifier: a set of server ids is a quorum when it
 * holds more than half of the configured number of servers.
 */
public class QuorumMaj implements QuorumVerifier {
    private static final Logger LOG = LoggerFactory.getLogger(QuorumMaj.class);

    /** Half the number of servers (integer division), computed once. */
    int half;

    /**
     * Precomputes the majority threshold so it is not recalculated per check.
     *
     * @param n number of servers
     */
    public QuorumMaj(int n) {
        half = n / 2;
    }

    /**
     * Every server carries the same weight.
     *
     * @param id server id (not used)
     * @return always 1
     */
    public long getWeight(long id) {
        return 1L;
    }

    /**
     * Tells whether the given set of server ids is a majority.
     *
     * @param set ids of the servers to count
     * @return {@code true} if the set has strictly more than {@code half} members
     */
    public boolean containsQuorum(Set<Long> set) {
        return half < set.size();
    }
}
最后总结说一下启动时选举流程:
zk选举算法:
1、处理投票
收到的epoch大于当前的epoch 胜出
如果收到的epoch等于当前epoch,那么收到的zxid大于当前zxid胜出
如果收到的epoch和zxid都相等,那么收到的myid大于当前myid的胜出
2、选出胜者
得票数超过半数者胜出选举