// ZooKeeper leader election algorithm source code (FastLeaderElection)
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.zookeeper.server.quorum;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.zookeeper.common.Time;
import org.apache.zookeeper.jmx.MBeanRegistry;
import org.apache.zookeeper.server.ZooKeeperThread;
import org.apache.zookeeper.server.quorum.QuorumCnxManager.Message;
import org.apache.zookeeper.server.quorum.QuorumPeer.LearnerType;
import org.apache.zookeeper.server.quorum.QuorumPeer.QuorumServer;
import org.apache.zookeeper.server.quorum.QuorumPeer.ServerState;
import org.apache.zookeeper.server.util.ZxidUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Implementation of leader election using TCP. It uses an object of the class
* QuorumCnxManager to manage connections. Otherwise, the algorithm is push-based
* as with the other UDP implementations.
*
* There are a few parameters that can be tuned to change its behavior. First,
* finalizeWait determines the amount of time to wait until deciding upon a leader.
* This is part of the leader election algorithm.
*
* Notes:
* - Connections to the other servers are handled by a QuorumCnxManager over TCP.
*   Apart from that transport, the algorithm is push-based, just like the
*   UDP-based election implementations (it does not fall back to UDP).
* - finalizeWait is a constant of this class; it determines how long to wait
*   before settling on a leader and is part of the election algorithm itself.
*/
public class FastLeaderElection implements Election {
private static final Logger LOG = LoggerFactory.getLogger(FastLeaderElection.class);
/**
* Determine how much time a process has to wait
* once it believes that it has reached the end of
* leader election.
* The value is in milliseconds (200 ms). In lookForLeader() it is the initial
* timeout used when polling the notification queue, and also the timeout of
* the final drain of that queue before a leader is declared.
*/
final static int finalizeWait = 200;
/**
* Upper bound on the amount of time between two consecutive
* notification checks. This impacts the amount of time to get
* the system up again after long partitions. Currently 60 seconds.
* The value is in milliseconds. In lookForLeader() the poll timeout starts at
* finalizeWait, is doubled after every poll that returns nothing, and is
* capped at this value.
*/
final static int maxNotificationInterval = 60000;
/**
* Connection manager. Fast leader election uses TCP for
* communication between peers, and QuorumCnxManager manages
* such connections.
*/
QuorumCnxManager manager;
/**
 * Notifications are messages that let other peers know that
 * a given peer has changed its vote, either because it has
 * joined leader election or because it learned of another
 * peer with higher zxid or same zxid and higher server id.
 */
public static class Notification {
    /** Format version, introduced in 3.4.6. */
    public static final int CURRENTVERSION = 0x1;

    /** Format version carried by this notification (0x0 when the sender did not include one). */
    int version;

    /** Server id of the proposed leader. */
    long leader;

    /** zxid of the proposed leader. */
    long zxid;

    /** Election epoch (logical clock) of the round this notification belongs to. */
    long electionEpoch;

    /** State of the sender: LOOKING, FOLLOWING, LEADING or OBSERVING. */
    QuorumPeer.ServerState state;

    /** Server id of the sender. */
    long sid;

    /** Epoch of the proposed leader. */
    long peerEpoch;

    /**
     * Renders every field, each followed by a short tag naming it; zxid and
     * both epochs are printed in hexadecimal.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append(Long.toHexString(version)).append(" (message format version), ");
        text.append(leader).append(" (n.leader), 0x");
        text.append(Long.toHexString(zxid)).append(" (n.zxid), 0x");
        text.append(Long.toHexString(electionEpoch)).append(" (n.round), ");
        text.append(state).append(" (n.state), ");
        text.append(sid).append(" (n.sid), 0x");
        text.append(Long.toHexString(peerEpoch)).append(" (n.peerEpoch) ");
        return text.toString();
    }
}
/**
 * Serializes a notification into a fresh 40-byte buffer.
 *
 * Layout, in write order: state (int), leader (long), zxid (long),
 * electionEpoch (long), epoch (long), Notification.CURRENTVERSION (int).
 * The buffer is returned as written, i.e. it is not flipped and its
 * position is at the end of the data.
 *
 * @param state         numeric state of the sender
 * @param leader        id of the proposed leader
 * @param zxid          zxid of the proposed leader
 * @param electionEpoch logical clock of the election round
 * @param epoch         epoch of the proposed leader
 * @return buffer holding the encoded notification
 */
static ByteBuffer buildMsg(int state,
                           long leader,
                           long zxid,
                           long electionEpoch,
                           long epoch) {
    // 4 (state) + 4 * 8 (leader, zxid, electionEpoch, epoch) + 4 (version) = 40 bytes
    ByteBuffer packet = ByteBuffer.wrap(new byte[40]);
    packet.clear();
    packet.putInt(state)
          .putLong(leader)
          .putLong(zxid)
          .putLong(electionEpoch)
          .putLong(epoch)
          .putInt(Notification.CURRENTVERSION);
    return packet;
}
/**
 * Messages that a peer wants to send to other peers.
 * These messages can be both Notifications and Acks
 * of reception of notification.
 */
public static class ToSend {
    /** Kinds of message; the constructor accepts one but does not store it. */
    enum mType {crequest, challenge, notification, ack}

    /** Proposed leader in the case of notification. */
    long leader;

    /** id contains the tag for acks, and zxid for notifications. */
    long zxid;

    /** Logical clock of the election round this message belongs to. */
    long electionEpoch;

    /** Current state. */
    QuorumPeer.ServerState state;

    /** Address (server id) of recipient. */
    long sid;

    /** Leader epoch. */
    long peerEpoch;

    /**
     * Builds a message for the server identified by sid.
     *
     * @param type          message kind (accepted but not retained)
     * @param leader        proposed leader
     * @param zxid          zxid of the proposed leader
     * @param electionEpoch logical clock of the election round
     * @param state         state to report to the recipient
     * @param sid           server id of the recipient
     * @param peerEpoch     epoch of the proposed leader
     */
    ToSend(mType type,
           long leader,
           long zxid,
           long electionEpoch,
           ServerState state,
           long sid,
           long peerEpoch) {
        this.sid = sid;
        this.state = state;
        this.leader = leader;
        this.zxid = zxid;
        this.electionEpoch = electionEpoch;
        this.peerEpoch = peerEpoch;
    }
}
// Outgoing messages: filled by sendNotifications() and WorkerReceiver, drained by WorkerSender.
LinkedBlockingQueue<ToSend> sendqueue;
// Incoming notifications: filled by WorkerReceiver, polled by lookForLeader().
LinkedBlockingQueue<Notification> recvqueue;
/**
* Multi-threaded implementation of message handler. Messenger
* declares two inner classes: WorkerReceiver and WorkerSender. The
* functionality of each is obvious from the name. Each of these
* is run on its own thread.
*/
protected class Messenger {
/**
 * Receives messages from instance of QuorumCnxManager on
 * method run(), and processes such messages.
 */
class WorkerReceiver extends ZooKeeperThread {
    /** Set to true (see Messenger.halt()) to make run() leave its loop. */
    volatile boolean stop;
    /** Connection manager whose receive queue is polled. */
    QuorumCnxManager manager;

    WorkerReceiver(QuorumCnxManager manager) {
        super("WorkerReceiver");
        this.stop = false;
        this.manager = manager;
    }

    /**
     * Polls the connection manager for incoming messages until stop is set.
     * Messages from non-voters are answered immediately with the current
     * vote. Messages from voters are decoded into a Notification and either
     * handed to the election loop through recvqueue (when this peer is
     * LOOKING) or answered with the vote this peer currently holds.
     */
    public void run() {
        Message response;
        while (!stop) {
            // Sleeps on receive
            try {
                response = manager.pollRecvQueue(3000, TimeUnit.MILLISECONDS);
                if (response == null) continue;

                /*
                 * If it is from an observer, respond right away.
                 * Note that the following predicate assumes that
                 * if a server is not a follower, then it must be
                 * an observer. If we ever have any other type of
                 * learner in the future, we'll have to change the
                 * way we check for observers.
                 */
                if (!validVoter(response.sid)) {
                    Vote current = self.getCurrentVote();
                    ToSend notmsg = new ToSend(ToSend.mType.notification,
                            current.getId(),
                            current.getZxid(),
                            logicalclock.get(),
                            self.getPeerState(),
                            response.sid,
                            current.getPeerEpoch());
                    sendqueue.offer(notmsg);
                } else {
                    // Receive new message
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Receive new notification message. My id = "
                                + self.getId());
                    }

                    /*
                     * We check for 28 bytes for backward compatibility
                     * (state int + leader, zxid and electionEpoch longs).
                     */
                    if (response.buffer.capacity() < 28) {
                        LOG.error("Got a short response: "
                                + response.buffer.capacity());
                        continue;
                    }
                    // Exactly 28 bytes: the message carries no peerEpoch field.
                    boolean backCompatibility = (response.buffer.capacity() == 28);
                    response.buffer.clear();

                    // Instantiate Notification and set its attributes
                    Notification n = new Notification();

                    // State of peer that sent this message
                    QuorumPeer.ServerState ackstate = QuorumPeer.ServerState.LOOKING;
                    switch (response.buffer.getInt()) {
                    case 0:
                        ackstate = QuorumPeer.ServerState.LOOKING;
                        break;
                    case 1:
                        ackstate = QuorumPeer.ServerState.FOLLOWING;
                        break;
                    case 2:
                        ackstate = QuorumPeer.ServerState.LEADING;
                        break;
                    case 3:
                        ackstate = QuorumPeer.ServerState.OBSERVING;
                        break;
                    default:
                        // Unknown state code: drop the message.
                        continue;
                    }

                    n.leader = response.buffer.getLong();
                    n.zxid = response.buffer.getLong();
                    n.electionEpoch = response.buffer.getLong();
                    n.state = ackstate;
                    n.sid = response.sid;
                    if (!backCompatibility) {
                        n.peerEpoch = response.buffer.getLong();
                    } else {
                        if (LOG.isInfoEnabled()) {
                            LOG.info("Backward compatibility mode, server id=" + n.sid);
                        }
                        // No peerEpoch on the wire: derive it from the zxid.
                        n.peerEpoch = ZxidUtils.getEpochFromZxid(n.zxid);
                    }

                    /*
                     * Version added in 3.4.6
                     */
                    n.version = (response.buffer.remaining() >= 4) ?
                            response.buffer.getInt() : 0x0;

                    /*
                     * Print notification info
                     */
                    if (LOG.isInfoEnabled()) {
                        printNotification(n);
                    }

                    /*
                     * If this server is looking, then send proposed leader
                     */
                    if (self.getPeerState() == QuorumPeer.ServerState.LOOKING) {
                        recvqueue.offer(n);

                        /*
                         * Send a notification back if the peer that sent this
                         * message is also looking and its logical clock is
                         * lagging behind.
                         */
                        if ((ackstate == QuorumPeer.ServerState.LOOKING)
                                && (n.electionEpoch < logicalclock.get())) {
                            Vote v = getVote();
                            ToSend notmsg = new ToSend(ToSend.mType.notification,
                                    v.getId(),
                                    v.getZxid(),
                                    logicalclock.get(),
                                    self.getPeerState(),
                                    response.sid,
                                    v.getPeerEpoch());
                            sendqueue.offer(notmsg);
                        }
                    } else {
                        /*
                         * If this server is not looking, but the one that sent the ack
                         * is looking, then send back what it believes to be the leader.
                         */
                        Vote current = self.getCurrentVote();
                        if (ackstate == QuorumPeer.ServerState.LOOKING) {
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("Sending new notification. My id = " +
                                        self.getId() + " recipient=" +
                                        response.sid + " zxid=0x" +
                                        Long.toHexString(current.getZxid()) +
                                        " leader=" + current.getId());
                            }

                            ToSend notmsg;
                            if (n.version > 0x0) {
                                notmsg = new ToSend(
                                        ToSend.mType.notification,
                                        current.getId(),
                                        current.getZxid(),
                                        current.getElectionEpoch(),
                                        self.getPeerState(),
                                        response.sid,
                                        current.getPeerEpoch());
                            } else {
                                // Sender did not report a format version: answer
                                // with the backward-compatible vote.
                                Vote bcVote = self.getBCVote();
                                notmsg = new ToSend(
                                        ToSend.mType.notification,
                                        bcVote.getId(),
                                        bcVote.getZxid(),
                                        bcVote.getElectionEpoch(),
                                        self.getPeerState(),
                                        response.sid,
                                        bcVote.getPeerEpoch());
                            }
                            sendqueue.offer(notmsg);
                        }
                    }
                }
            } catch (InterruptedException e) {
                // Report through the class logger rather than stdout so the event
                // follows the logging configuration and keeps its stack trace.
                // The loop keeps running until stop is set. The interrupt flag is
                // deliberately not re-asserted: doing so would presumably make the
                // next blocking poll fail immediately and spin this loop.
                LOG.warn("Interrupted Exception while waiting for new message", e);
            }
        }
        LOG.info("WorkerReceiver is down");
    }
}
/**
 * This worker simply dequeues a message to send and
 * queues it on the manager's queue.
 */
class WorkerSender extends ZooKeeperThread {
    /** Set to true (see Messenger.halt()) to make run() leave its loop. */
    volatile boolean stop;
    /** Connection manager that performs the actual send. */
    QuorumCnxManager manager;

    WorkerSender(QuorumCnxManager manager) {
        super("WorkerSender");
        this.manager = manager;
        this.stop = false;
    }

    /**
     * Drains sendqueue until stop is set or the thread is interrupted,
     * waiting at most three seconds for each message.
     */
    public void run() {
        try {
            while (!stop) {
                ToSend next = sendqueue.poll(3000, TimeUnit.MILLISECONDS);
                if (next != null) {
                    process(next);
                }
            }
        } catch (InterruptedException e) {
            // An interrupt ends the sender; fall through to the log line below.
        }
        LOG.info("WorkerSender is down");
    }

    /**
     * Called by run() once there is a new message to send.
     *
     * @param m message to send
     */
    void process(ToSend m) {
        int stateCode = m.state.ordinal();
        ByteBuffer encoded = buildMsg(stateCode, m.leader, m.zxid, m.electionEpoch, m.peerEpoch);
        manager.toSend(m.sid, encoded);
    }
}
/** Worker that pushes queued messages to the connection manager. */
WorkerSender ws;
/** Worker that pulls incoming messages from the connection manager. */
WorkerReceiver wr;

/**
 * Constructor of class Messenger. Creates both workers and starts each
 * one on its own daemon thread, sender first.
 *
 * @param manager Connection manager
 */
Messenger(QuorumCnxManager manager) {
    ws = new WorkerSender(manager);
    Thread senderThread = new Thread(ws, "WorkerSender[myid=" + self.getId() + "]");
    senderThread.setDaemon(true);
    senderThread.start();

    wr = new WorkerReceiver(manager);
    Thread receiverThread = new Thread(wr, "WorkerReceiver[myid=" + self.getId() + "]");
    receiverThread.setDaemon(true);
    receiverThread.start();
}
/**
 * Asks both workers to stop by raising their stop flags; each worker
 * checks its flag at the top of its loop.
 */
void halt() {
    ws.stop = true;
    wr.stop = true;
}
}
QuorumPeer self; // the local peer (this server) taking part in the election
Messenger messenger;
// Logical clock of the election, held in an AtomicLong; incremented at the start of every lookForLeader() call
AtomicLong logicalclock = new AtomicLong(); /* Election instance */
// Current proposal (vote) of this server: proposed leader id, its zxid and its epoch
long proposedLeader;
long proposedZxid;
long proposedEpoch;
/**
 * Returns the current value of the logical clock counter.
 *
 * @return value of the logical clock at the time of the call
 */
public long getLogicalClock() {
    final long now = logicalclock.get();
    return now;
}
/**
 * Constructor of FastLeaderElection. It takes two parameters, one
 * is the QuorumPeer object that instantiated this object, and the other
 * is the connection manager. Such an object should be created only once
 * by each peer during an instance of the ZooKeeper service.
 *
 * @param self    QuorumPeer that created this object
 * @param manager Connection manager
 */
public FastLeaderElection(QuorumPeer self, QuorumCnxManager manager) {
    stop = false;
    this.manager = manager;
    // Remaining initialisation, including starting the messenger threads.
    starter(self, manager);
}
/**
 * Initialisation shared by every constructor of this class: records the
 * owning peer, creates the send/receive queues, resets the proposed leader
 * and zxid to -1 and finally creates the Messenger, which starts the
 * worker threads.
 *
 * @param self    QuorumPeer that created this object
 * @param manager Connection manager
 */
private void starter(QuorumPeer self, QuorumCnxManager manager) {
    this.self = self;

    sendqueue = new LinkedBlockingQueue<ToSend>();
    recvqueue = new LinkedBlockingQueue<Notification>();

    proposedZxid = -1;
    proposedLeader = -1;

    // Created last: its constructor starts threads that use the fields above.
    messenger = new Messenger(manager);
}
/**
 * Called when an election round ends: logs the final vote at debug level
 * and discards any notifications still waiting in recvqueue.
 *
 * @param v the vote the election ended with
 */
private void leaveInstance(Vote v) {
    if (LOG.isDebugEnabled()) {
        String zxidHex = Long.toHexString(v.getZxid());
        LOG.debug("About to leave FLE instance: leader=" + v.getId()
                + ", zxid=0x" + zxidHex
                + ", my id=" + self.getId()
                + ", my state=" + self.getPeerState());
    }
    recvqueue.clear();
}
/**
 * @return the connection manager this election instance was created with
 */
public QuorumCnxManager getCnxManager() {
    return this.manager;
}
/** Raised by shutdown(); lookForLeader() stops looping once it is true. */
volatile boolean stop;

/**
 * Stops this election instance: raises the stop flag, then halts the
 * connection manager and the messenger, in that order.
 */
public void shutdown() {
    this.stop = true;

    LOG.debug("Shutting down connection manager");
    this.manager.halt();

    LOG.debug("Shutting down messenger");
    this.messenger.halt();

    LOG.debug("FLE is down");
}
/**
 * Send notifications to all peers upon a change in our vote.
 *
 * For every server in the voting view a ToSend carrying the current
 * proposal, the current logical clock and state LOOKING is placed on
 * sendqueue; the message is transmitted later by WorkerSender.
 */
private void sendNotifications() {
    // getVotingView() maps server id -> server; only the servers are needed here.
    for (QuorumServer peer : self.getVotingView().values()) {
        final long recipient = peer.id;

        ToSend proposal = new ToSend(ToSend.mType.notification,
                proposedLeader,
                proposedZxid,
                logicalclock.get(),               // logical clock of the current round
                QuorumPeer.ServerState.LOOKING,
                recipient,                        // server id of the receiver
                proposedEpoch);

        if (LOG.isDebugEnabled()) {
            LOG.debug("Sending Notification: " + proposedLeader + " (n.leader), 0x" +
                    Long.toHexString(proposedZxid) + " (n.zxid), 0x" + Long.toHexString(logicalclock.get()) +
                    " (n.round), " + recipient + " (recipient), " + self.getId() +
                    " (myid), 0x" + Long.toHexString(proposedEpoch) + " (n.peerEpoch)");
        }

        // Hand the message over to the sender thread.
        sendqueue.offer(proposal);
    }
}
/**
 * Logs a received notification at info level together with the state of
 * this peer.
 *
 * @param n notification to log
 */
private void printNotification(Notification n) {
    String rendered = n.toString();
    LOG.info("Notification: " + rendered
            + self.getPeerState() + " (my state)");
}
/**
 * Check if a candidate (server id, zxid, epoch) succeeds our
 * current vote.
 *
 * @param newId    server id of the candidate proposed by the incoming vote
 * @param newZxid  zxid of that candidate
 * @param newEpoch epoch of that candidate
 * @param curId    server id of the candidate currently proposed
 * @param curZxid  zxid of the candidate currently proposed
 * @param curEpoch epoch of the candidate currently proposed
 * @return true if the new candidate should replace the current one
 */
protected boolean totalOrderPredicate(long newId, long newZxid, long newEpoch, long curId, long curZxid, long curEpoch) {
    // Guarded like the other debug statements in this class so the message
    // is only assembled when debug logging is enabled.
    if (LOG.isDebugEnabled()) {
        LOG.debug("id: " + newId + ", proposed id: " + curId + ", zxid: 0x" +
                Long.toHexString(newZxid) + ", proposed zxid: 0x" + Long.toHexString(curZxid));
    }

    // A candidate with zero weight never wins (presumably e.g. an observer;
    // confirm against the QuorumVerifier in use).
    if (self.getQuorumVerifier().getWeight(newId) == 0) {
        return false;
    }

    /*
     * We return true if one of the following three cases hold:
     * 1- New epoch is higher
     * 2- New epoch is the same as current epoch, but new zxid is higher
     * 3- New epoch is the same as current epoch, new zxid is the same
     *  as current zxid, but server id is higher.
     */
    return ((newEpoch > curEpoch) ||
            ((newEpoch == curEpoch) &&
            ((newZxid > curZxid) || ((newZxid == curZxid) && (newId > curId)))));
}
/**
 * Termination predicate. Given a set of votes, determines if there are
 * enough of them to declare the end of the election round.
 *
 * @param votes received votes, keyed by the id of the server that cast them
 * @param vote  the vote whose support is being counted
 * @return true if the servers whose vote equals {@code vote} form a quorum
 */
protected boolean termPredicate(
        HashMap<Long, Vote> votes,
        Vote vote) {
    /*
     * First make the views consistent. Sometimes peers will have
     * different zxids for a server depending on timing.
     */
    HashSet<Long> supporters = new HashSet<Long>();
    for (Long voterId : votes.keySet()) {
        // Count a voter only if its ballot matches the vote being checked.
        if (vote.equals(votes.get(voterId))) {
            supporters.add(voterId);
        }
    }

    return self.getQuorumVerifier().containsQuorum(supporters);
}
/**
 * In the case there is a leader elected, and a quorum supporting
 * this leader, we have to check if the leader has voted and acked
 * that it is leading. We need this check to avoid that peers keep
 * electing over and over a peer that has crashed and it is no
 * longer leading.
 *
 * @param votes         set of votes
 * @param leader        leader id
 * @param electionEpoch epoch id
 * @return true if the leader is confirmed
 */
protected boolean checkLeader(
        HashMap<Long, Vote> votes,
        long leader,
        long electionEpoch) {
    /*
     * If everyone else thinks I'm the leader, I must be the leader,
     * provided the election epoch matches my logical clock.
     */
    if (leader == self.getId()) {
        return logicalclock.get() == electionEpoch;
    }

    /*
     * I'm not the leader: the leader itself must have sent a vote
     * stating that it is leading.
     */
    Vote leaderVote = votes.get(leader);
    return leaderVote != null && leaderVote.getState() == ServerState.LEADING;
}
/**
 * This predicate checks that a leader has been elected. It doesn't
 * make a lot of sense without context (check lookForLeader) and it
 * has been separated for testing purposes.
 *
 * @param recv map of received votes
 * @param ooe  map containing out of election votes (LEADING or FOLLOWING)
 * @param n    Notification
 * @return true if the vote carried by n has a quorum in recv and its
 *         leader passes checkLeader against ooe
 */
protected boolean ooePredicate(HashMap<Long,Vote> recv,
                               HashMap<Long,Vote> ooe,
                               Notification n) {
    Vote candidate = new Vote(n.version,
                              n.leader,
                              n.zxid,
                              n.electionEpoch,
                              n.peerEpoch,
                              n.state);
    if (!termPredicate(recv, candidate)) {
        return false;
    }
    return checkLeader(ooe, n.leader, n.electionEpoch);
}
/**
 * Replaces the current proposal (vote) of this server.
 *
 * @param leader id of the newly proposed leader
 * @param zxid   zxid of the newly proposed leader
 * @param epoch  epoch of the newly proposed leader
 */
synchronized void updateProposal(long leader, long zxid, long epoch) {
    if (LOG.isDebugEnabled()) {
        String newZxidHex = Long.toHexString(zxid);
        String oldZxidHex = Long.toHexString(proposedZxid);
        LOG.debug("Updating proposal: " + leader + " (newleader), 0x"
                + newZxidHex + " (newzxid), " + proposedLeader
                + " (oldleader), 0x" + oldZxidHex + " (oldzxid)");
    }
    this.proposedLeader = leader;
    this.proposedZxid = zxid;
    this.proposedEpoch = epoch;
}
/**
 * @return a new Vote built from the current proposal (leader, zxid, epoch)
 */
synchronized Vote getVote() {
    Vote current = new Vote(proposedLeader, proposedZxid, proposedEpoch);
    return current;
}
/**
 * A learning state can be either FOLLOWING or OBSERVING.
 * This method simply decides which one depending on the
 * role of the server.
 *
 * @return FOLLOWING for a participant, OBSERVING otherwise
 */
private ServerState learningState() {
    boolean participant = self.getLearnerType() == LearnerType.PARTICIPANT;
    if (!participant) {
        LOG.debug("I'm an observer: " + self.getId());
        return ServerState.OBSERVING;
    }
    LOG.debug("I'm a participant: " + self.getId());
    return ServerState.FOLLOWING;
}
/**
 * Returns the initial vote value of server identifier.
 *
 * @return this server's id if it is a participant, Long.MIN_VALUE otherwise
 */
private long getInitId() {
    return (self.getLearnerType() == LearnerType.PARTICIPANT)
            ? self.getId()
            : Long.MIN_VALUE;
}
/**
 * Returns initial last logged zxid.
 *
 * @return the last zxid logged by this server if it is a participant,
 *         Long.MIN_VALUE otherwise
 */
private long getInitLastLoggedZxid() {
    return (self.getLearnerType() == LearnerType.PARTICIPANT)
            ? self.getLastLoggedZxid()
            : Long.MIN_VALUE;
}
/**
 * Returns the initial vote value of the peer epoch.
 *
 * @return the current epoch of this server if it is a participant,
 *         Long.MIN_VALUE otherwise
 * @throws RuntimeException if reading the current epoch fails with an
 *         IOException; the IOException is attached as the cause
 */
private long getPeerEpoch() {
    if (self.getLearnerType() != LearnerType.PARTICIPANT) {
        return Long.MIN_VALUE;
    }
    try {
        return self.getCurrentEpoch();
    } catch (IOException e) {
        // Chain the original exception instead of copying only its message
        // and stack trace, so its type and own cause chain are preserved.
        throw new RuntimeException(e.getMessage(), e);
    }
}
/**
* Starts a new round of leader election. Whenever our QuorumPeer
* changes its state to LOOKING, this method is invoked, and it
* sends notifications to all other peers.
*
* @return the final vote once the election terminates, or null if the loop
*         ends because this peer is no longer LOOKING or shutdown() set stop
* @throws InterruptedException if interrupted while waiting on the notification queue
*/
public Vote lookForLeader() throws InterruptedException {
/* Step 1: register the election bean with JMX (Java Management Extensions)
* and initialise the state used by this election round. */
try {
self.jmxLeaderElectionBean = new LeaderElectionBean();
MBeanRegistry.getInstance().register(
self.jmxLeaderElectionBean, self.jmxLocalPeerBean);
} catch (Exception e) {
LOG.warn("Failed to register with JMX", e);
self.jmxLeaderElectionBean = null;
}
if (self.start_fle == 0) {
// Record the election start time only if it has not been set yet.
// NOTE(review): Time.currentElapsedTime() is presumably a millisecond clock that is
// not affected by wall-clock changes (unlike System.currentTimeMillis()) - confirm
// in org.apache.zookeeper.common.Time.
self.start_fle = Time.currentElapsedTime();
}
try {
/* Votes received from other servers during the current round, keyed by the
* server id of the sender. This map plays the role of the ballot box. */
HashMap<Long, Vote> recvset = new HashMap<Long, Vote>();
/* Votes reported by peers that are already FOLLOWING or LEADING, i.e. peers that
* are not taking part in this election, keyed by the server id of the sender. */
HashMap<Long, Vote> outofelection = new HashMap<Long, Vote>();
// notTimeout = notification timeout (ms) used when polling recvqueue
int notTimeout = finalizeWait;
/* Step 2: start by proposing this server's own initial values. */
synchronized(this){
// Advance the logical clock: a new election round starts here.
logicalclock.incrementAndGet();
/* Reset the proposal to this server's initial id, zxid and epoch. */
updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
}
LOG.info("New election. My id = " + self.getId() +
", proposed zxid=0x" + Long.toHexString(proposedZxid));
// Queue the proposal for every voting peer (the actual send is done by WorkerSender).
sendNotifications();
/* Step 3: compare incoming votes with our own proposal (core of the algorithm). */
/*
* Loop in which we exchange notifications until we find a leader
*/
// Keep going while this peer is still LOOKING and shutdown() has not been requested.
while ((self.getPeerState() == ServerState.LOOKING) &&
(!stop)){
/*
* Remove next notification from queue, times out after 2 times
* the termination time
* (recvqueue is a LinkedBlockingQueue.)
*/
/* Notifications are placed on recvqueue by WorkerReceiver; take the next one,
* waiting at most notTimeout milliseconds. */
Notification n = recvqueue.poll(notTimeout,
TimeUnit.MILLISECONDS);
/*
* Sends more notifications if haven't received enough.
* Otherwise processes new notification.
*/
/* n == null means no notification arrived within notTimeout ms while the
* election is still undecided. */
if(n == null){
/* NOTE(review): haveDelivered() presumably reports that queued messages reached
* the peers, i.e. connectivity looks fine - confirm in QuorumCnxManager. */
if(manager.haveDelivered()){
/* Re-send our proposal to solicit notifications again. A repeated vote from
* the same sender simply replaces its earlier entry in recvset (same key). */
sendNotifications();
/* Otherwise: messages were apparently not delivered. */
} else {
// Only (re)connect to the other servers; notifications are not re-sent here.
// NOTE(review): the presumed rationale is that peers which did not hear from
// this server will re-send their own notifications (see the branch above),
// so once connections are re-established this server receives their votes
// without having to re-send its own - confirm against QuorumCnxManager.
manager.connectAll();
}
/*
* Exponential backoff
*/
/* Double the timeout, capped at maxNotificationInterval (60 seconds). */
int tmpTimeOut = notTimeout*2;
notTimeout = (tmpTimeOut < maxNotificationInterval?
tmpTimeOut : maxNotificationInterval);
LOG.info("Notification time out: " + notTimeout);
}
/* A notification was received. Proceed only if both
* 1) the sender (n.sid) and
* 2) the leader it proposes (n.leader)
* are members of the voting view (see validVoter()). */
else if(validVoter(n.sid) && validVoter(n.leader)) {
/*
* Only proceed if the vote comes from a replica in the
* voting view for a replica in the voting view.
*/
switch (n.state) { // dispatch on the state of the sender
case LOOKING: // the sender is electing as well
// If notification > current, replace and send messages out
/* The sender's election epoch is ahead of our logical clock, so our round
* is stale: adopt the sender's round and start counting from scratch. */
if (n.electionEpoch > logicalclock.get()) {
// Move our logical clock forward to the sender's round.
logicalclock.set(n.electionEpoch);
// Empty the ballot box: votes collected so far belong to the old round.
recvset.clear();
/* Compare the leader proposed by n with this server's own initial values
* and adopt whichever wins; either way the resulting proposal is
* broadcast below. */
if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
getInitId(), getInitLastLoggedZxid(), getPeerEpoch())) {
updateProposal(n.leader, n.zxid, n.peerEpoch);// n's candidate wins
} else {// this server's own values win
updateProposal(getInitId(),
getInitLastLoggedZxid(),
getPeerEpoch());
}
sendNotifications();
/* The sender's election epoch is behind our logical clock: the notification
* belongs to an older round and is ignored. */
} else if (n.electionEpoch < logicalclock.get()) {
if(LOG.isDebugEnabled()){
LOG.debug("Notification election epoch is smaller than logicalclock. n.electionEpoch = 0x"
+ Long.toHexString(n.electionEpoch)
+ ", logicalclock=0x" + Long.toHexString(logicalclock.get()));
}
break;
/* Same round: compare the leader proposed by n with our current proposal.
* totalOrderPredicate returns true when n's candidate wins. */
} else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
proposedLeader, proposedZxid, proposedEpoch)) {
// n's candidate wins: adopt it and broadcast the new proposal.
updateProposal(n.leader, n.zxid, n.peerEpoch);
sendNotifications();
}
if(LOG.isDebugEnabled()){
LOG.debug("Adding vote: from=" + n.sid +
", proposed leader=" + n.leader +
", proposed zxid=0x" + Long.toHexString(n.zxid) +
", proposed election epoch=0x" + Long.toHexString(n.electionEpoch));
}
// Record the sender's vote in the ballot box.
recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
/* Step 4: decide whether this round can end. */
/* Does our current proposal have a quorum among the recorded votes? */
if (termPredicate(recvset,
new Vote(proposedLeader, proposedZxid,
logicalclock.get(), proposedEpoch))) {
// Verify if there is any change in the proposed leader
// The loop below has two exits:
// 1) the loop condition: no further notification arrived within finalizeWait,
//    so nothing better than the quorum-backed proposal was seen; n is null.
// 2) the break: a pending notification proposes a better candidate; it is put
//    back on the queue so the outer loop processes it; n is non-null.
while((n = recvqueue.poll(finalizeWait,
TimeUnit.MILLISECONDS)) != null){
if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
proposedLeader, proposedZxid, proposedEpoch)){
recvqueue.put(n);
break;// election not finished; continue with the outer loop
}
}
/*
* This predicate is true once we don't read any new
* relevant message from the reception queue
*/
/* n == null: no better candidate showed up, the quorum-backed proposal stands. */
if (n == null) {// leader found: finish up and change the state of this peer
// LEADING if the elected leader is this server, otherwise the learning
// state (FOLLOWING or OBSERVING, see learningState()).
self.setPeerState((proposedLeader == self.getId()) ?
ServerState.LEADING: learningState());
// Build the final vote.
Vote endVote = new Vote(proposedLeader,
proposedZxid,
logicalclock.get(),
proposedEpoch);
// Discard notifications still pending in recvqueue.
leaveInstance(endVote);
return endVote;
}
}
break;// leave the switch; the outer while loop fetches the next notification
/* Step 5: notifications from peers that are not electing. */
// Notifications from observers are only logged.
case OBSERVING:
LOG.debug("Notification from observer: " + n.sid);
break;
/* Notifications in FOLLOWING or LEADING state come from peers that are not
* LOOKING: WorkerReceiver on such a peer answers a LOOKING sender with the
* peer's current vote and its own state.
* NOTE(review): typical situations are presumably (1) this server joining an
* ensemble that already has a leader, (2) this server noticing a leader failure
* before the others do, (3) the others having finished the current round before
* this server - confirm.
*/
case FOLLOWING:
case LEADING:
/*
* Consider all notifications from the same epoch
* together.
*/
/* The notification belongs to the same round as our logical clock. */
if(n.electionEpoch == logicalclock.get()){
// Record the vote in the ballot box.
recvset.put(n.sid, new Vote(n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch));
if(ooePredicate(recvset, outofelection, n)) {
self.setPeerState((n.leader == self.getId()) ?
ServerState.LEADING: learningState());
Vote endVote = new Vote(n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch);
leaveInstance(endVote);
return endVote;
}
}
/*
* Before joining an established ensemble, verify
* a majority is following the same leader.
*/
/* Regardless of the round it belongs to, the vote of a FOLLOWING/LEADING peer
* is recorded in outofelection (including the sender's version and state). */
outofelection.put(n.sid, new Vote(n.version,
n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch,
n.state));
/* If n's vote has a quorum within outofelection and its leader passes
* checkLeader, follow that leader. */
if(ooePredicate(outofelection, outofelection, n)) {
synchronized(this){
// Align our logical clock with the election epoch carried by n.
logicalclock.set(n.electionEpoch);
self.setPeerState((n.leader == self.getId()) ?
ServerState.LEADING: learningState());
}
Vote endVote = new Vote(n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch);
leaveInstance(endVote);
return endVote;
}
break;
default:
LOG.warn("Notification state unrecognized: {} (n.state), {} (n.sid)",
n.state, n.sid);
break;
}
} else {
if (!validVoter(n.leader)) {
LOG.warn("Ignoring notification for non-cluster member sid {} from sid {}", n.leader, n.sid);
}
if (!validVoter(n.sid)) {
LOG.warn("Ignoring notification for sid {} from non-quorum member sid {}", n.leader, n.sid);
}
}
} // end of the main while loop
return null;
} finally {
// Always unregister the election bean from JMX, whatever the outcome.
try {
if(self.jmxLeaderElectionBean != null){
MBeanRegistry.getInstance().unregister(
self.jmxLeaderElectionBean);
}
} catch (Exception e) {
LOG.warn("Failed to unregister with JMX", e);
}
self.jmxLeaderElectionBean = null;
LOG.debug("Number of connection processing threads: {}",
manager.getConnectionThreadCount());
}
}
/**
 * Check if a given sid is a key of the voting view returned by
 * self.getVotingView().
 *
 * @param sid Server identifier
 * @return true if the voting view contains sid
 */
private boolean validVoter(long sid) {
    boolean known = self.getVotingView().containsKey(sid);
    return known;
}
}