ZooKeeper源码解析(三):选举算法源码

ZooKeeper选举算法源码

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.apache.zookeeper.server.quorum;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.zookeeper.common.Time;
import org.apache.zookeeper.jmx.MBeanRegistry;
import org.apache.zookeeper.server.ZooKeeperThread;
import org.apache.zookeeper.server.quorum.QuorumCnxManager.Message;
import org.apache.zookeeper.server.quorum.QuorumPeer.LearnerType;
import org.apache.zookeeper.server.quorum.QuorumPeer.QuorumServer;
import org.apache.zookeeper.server.quorum.QuorumPeer.ServerState;
import org.apache.zookeeper.server.util.ZxidUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * Implementation of leader election using TCP. It uses an object of the class
 * QuorumCnxManager to manage connections. Otherwise, the algorithm is push-based
 * as with the other UDP implementations.
 *
 * There are a few parameters that can be tuned to change its behavior. First,
 * finalizeWait determines the amount of time to wait until deciding upon a leader.
 * This is part of the leader election algorithm.
 *
 * 使用TCP实现leader领导人选举,它使用QuorumCnxManager类的一个对象管理连接(与其他server间的连接管理)。
 * 除此之外,该算法与其他基于UDP的实现一样,是基于推送(push)的。
 *
 * 有几个参数可以用来改变它(选举)的行为。
 * 首先,finalizeWait(一个代码中的常量)决定选举一个leader的时间,
 * 这是leader选举算法的一部分。
 */


public class FastLeaderElection implements Election {
    private static final Logger LOG = LoggerFactory.getLogger(FastLeaderElection.class);

    /**
     * Determines how much time a process has to wait once it believes
     * that it has reached the end of leader election. lookForLeader()
     * uses this value (200, in milliseconds) as its initial timeout when
     * polling for notifications from other peers.
     */
    static final int finalizeWait = 200;


    /**
     * Upper bound on the amount of time between two consecutive
     * notification checks. This impacts the amount of time to get
     * the system up again after long partitions. Currently 60 seconds
     * (the value is expressed in milliseconds).
     */
    static final int maxNotificationInterval = 60000;

    /**
     * Connection manager. Fast leader election uses TCP for
     * communication between peers, and QuorumCnxManager manages
     * such connections.
     */
    QuorumCnxManager manager;


    /**
     * A notification is a message that lets other peers know that a given
     * peer has changed its vote, either because it has joined leader
     * election or because it learned of another peer with a higher zxid,
     * or with the same zxid and a higher server id.
     */
    public static class Notification {

        /** Message format version, introduced in 3.4.6. */
        public static final int CURRENTVERSION = 0x1;

        /** Format version carried by this notification. */
        int version;

        /** Server id of the proposed leader. */
        long leader;

        /** zxid of the proposed leader. */
        long zxid;

        /** Election epoch, i.e. the logical clock of the election round. */
        long electionEpoch;

        /** Current state of the sender. */
        QuorumPeer.ServerState state;

        /** Server id of the sender. */
        long sid;

        /** Epoch of the proposed leader. */
        long peerEpoch;

        /**
         * Renders every field with a short label; version, zxid, election
         * epoch and peer epoch are printed in hexadecimal.
         */
        @Override
        public String toString() {
            StringBuilder text = new StringBuilder();
            text.append(Long.toHexString(version)).append(" (message format version), ");
            text.append(leader).append(" (n.leader), 0x");
            text.append(Long.toHexString(zxid)).append(" (n.zxid), 0x");
            text.append(Long.toHexString(electionEpoch)).append(" (n.round), ");
            text.append(state).append(" (n.state), ");
            text.append(sid).append(" (n.sid), 0x");
            text.append(Long.toHexString(peerEpoch)).append(" (n.peerEpoch) ");
            return text.toString();
        }
    }
    
    /**
     * Builds the 40-byte notification packet that is sent to another peer.
     * The fields are written in this order: state (int), leader (long),
     * zxid (long), electionEpoch (long), epoch (long) and finally the
     * message format version Notification.CURRENTVERSION (int).
     *
     * @param state          numeric code of the sender's state
     * @param leader         server id of the proposed leader
     * @param zxid           zxid of the proposed leader
     * @param electionEpoch  logical clock of the election round
     * @param epoch          epoch of the proposed leader
     * @return the filled buffer, returned as written (neither flipped nor
     *         rewound)
     */
    static ByteBuffer buildMsg(int state,
            long leader,
            long zxid,
            long electionEpoch,
            long epoch) {
        // 4 + 8 + 8 + 8 + 8 + 4 = 40 bytes
        ByteBuffer packet = ByteBuffer.allocate(40);

        packet.putInt(state)
              .putLong(leader)
              .putLong(zxid)
              .putLong(electionEpoch)
              .putLong(epoch)
              .putInt(Notification.CURRENTVERSION);

        return packet;
    }

    /**
     * Messages that a peer wants to send to other peers.
     * According to the original documentation these messages can be both
     * notifications and acks of reception of a notification.
     */
    public static class ToSend {

        /** Kinds of message. Note that the constructor does not store the kind it receives. */
        enum mType {crequest, challenge, notification, ack}

        /** Proposed leader in the case of a notification. */
        long leader;

        /**
         * zxid for notifications (the original note adds that it carries
         * the tag for acks).
         */
        long zxid;

        /** Election epoch: the logical clock of the election this message belongs to. */
        long electionEpoch;

        /** Current state of the sender. */
        QuorumPeer.ServerState state;

        /** Server id of the recipient. */
        long sid;

        /** Leader epoch. */
        long peerEpoch;

        /**
         * Creates a message addressed to server {@code sid}.
         *
         * @param type           kind of message (accepted but not stored)
         * @param leader         proposed leader
         * @param zxid           zxid of the proposal
         * @param electionEpoch  logical clock of the election round
         * @param state          state of the sender
         * @param sid            server id of the recipient
         * @param peerEpoch      epoch of the proposed leader
         */
        ToSend(mType type,
                long leader,
                long zxid,
                long electionEpoch,
                ServerState state,
                long sid,
                long peerEpoch) {

            this.sid = sid;
            this.state = state;
            this.leader = leader;
            this.zxid = zxid;
            this.peerEpoch = peerEpoch;
            this.electionEpoch = electionEpoch;
        }
    }

    // Outgoing messages: filled via offer() by the election code and
    // drained by Messenger.WorkerSender.
    LinkedBlockingQueue<ToSend> sendqueue;
    // Incoming notifications: filled by Messenger.WorkerReceiver while this
    // peer is LOOKING and polled by lookForLeader().
    LinkedBlockingQueue<Notification> recvqueue;

    /**
     * Multi-threaded implementation of message handler. Messenger
     * contains two inner classes: WorkerReceiver and WorkerSender. The
     * functionality of each is obvious from the name. Each of them
     * is run in a new thread of its own.
     */

    protected class Messenger {

        /**
         * Receives messages from instance of QuorumCnxManager on
         * method run(), and processes such messages.
         */

        class WorkerReceiver extends ZooKeeperThread {
            volatile boolean stop;
            QuorumCnxManager manager;

            WorkerReceiver(QuorumCnxManager manager) {
                super("WorkerReceiver");
                this.stop = false;
                this.manager = manager;
            }

            public void run() {

                Message response;
                while (!stop) {
                    // Sleeps on receive
                    try{
                        response = manager.pollRecvQueue(3000, TimeUnit.MILLISECONDS);
                        if(response == null) continue;

                        /*
                         * If it is from an observer, respond right away.
                         * Note that the following predicate assumes that
                         * if a server is not a follower, then it must be
                         * an observer. If we ever have any other type of
                         * learner in the future, we'll have to change the
                         * way we check for observers.
                         */
                        if(!validVoter(response.sid)){
                            Vote current = self.getCurrentVote();
                            ToSend notmsg = new ToSend(ToSend.mType.notification,
                                    current.getId(),
                                    current.getZxid(),
                                    logicalclock.get(),
                                    self.getPeerState(),
                                    response.sid,
                                    current.getPeerEpoch());

                            sendqueue.offer(notmsg);
                        } else {
                            // Receive new message
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("Receive new notification message. My id = "
                                        + self.getId());
                            }

                            /*
                             * We check for 28 bytes for backward compatibility
                             */
                            if (response.buffer.capacity() < 28) {
                                LOG.error("Got a short response: "
                                        + response.buffer.capacity());
                                continue;
                            }
                            boolean backCompatibility = (response.buffer.capacity() == 28);
                            response.buffer.clear();

                            // Instantiate Notification and set its attributes
                            Notification n = new Notification();
                            
                            // State of peer that sent this message
                            QuorumPeer.ServerState ackstate = QuorumPeer.ServerState.LOOKING;
                            switch (response.buffer.getInt()) {
                            case 0:
                                ackstate = QuorumPeer.ServerState.LOOKING;
                                break;
                            case 1:
                                ackstate = QuorumPeer.ServerState.FOLLOWING;
                                break;
                            case 2:
                                ackstate = QuorumPeer.ServerState.LEADING;
                                break;
                            case 3:
                                ackstate = QuorumPeer.ServerState.OBSERVING;
                                break;
                            default:
                                continue;
                            }
                            
                            n.leader = response.buffer.getLong();
                            n.zxid = response.buffer.getLong();
                            n.electionEpoch = response.buffer.getLong();
                            n.state = ackstate;
                            n.sid = response.sid;
                            if(!backCompatibility){
                                n.peerEpoch = response.buffer.getLong();
                            } else {
                                if(LOG.isInfoEnabled()){
                                    LOG.info("Backward compatibility mode, server id=" + n.sid);
                                }
                                n.peerEpoch = ZxidUtils.getEpochFromZxid(n.zxid);
                            }

                            /*
                             * Version added in 3.4.6
                             */

                            n.version = (response.buffer.remaining() >= 4) ? 
                                         response.buffer.getInt() : 0x0;

                            /*
                             * Print notification info
                             */
                            if(LOG.isInfoEnabled()){
                                printNotification(n);
                            }

                            /*
                             * If this server is looking, then send proposed leader
                             */

                            if(self.getPeerState() == QuorumPeer.ServerState.LOOKING){
                                recvqueue.offer(n);

                                /*
                                 * Send a notification back if the peer that sent this
                                 * message is also looking and its logical clock is
                                 * lagging behind.
                                 */
                                if((ackstate == QuorumPeer.ServerState.LOOKING)
                                        && (n.electionEpoch < logicalclock.get())){
                                    Vote v = getVote();
                                    ToSend notmsg = new ToSend(ToSend.mType.notification,
                                            v.getId(),
                                            v.getZxid(),
                                            logicalclock.get(),
                                            self.getPeerState(),
                                            response.sid,
                                            v.getPeerEpoch());
                                    sendqueue.offer(notmsg);
                                }
                            } else {
                                /*
                                 * If this server is not looking, but the one that sent the ack
                                 * is looking, then send back what it believes to be the leader.
                                 */
                                Vote current = self.getCurrentVote();
                                if(ackstate == QuorumPeer.ServerState.LOOKING){
                                    if(LOG.isDebugEnabled()){
                                        LOG.debug("Sending new notification. My id =  " +
                                                self.getId() + " recipient=" +
                                                response.sid + " zxid=0x" +
                                                Long.toHexString(current.getZxid()) +
                                                " leader=" + current.getId());
                                    }
                                    
                                    ToSend notmsg;
                                    if(n.version > 0x0) {
                                        notmsg = new ToSend(
                                                ToSend.mType.notification,
                                                current.getId(),
                                                current.getZxid(),
                                                current.getElectionEpoch(),
                                                self.getPeerState(),
                                                response.sid,
                                                current.getPeerEpoch());
                                        
                                    } else {
                                        Vote bcVote = self.getBCVote();
                                        notmsg = new ToSend(
                                                ToSend.mType.notification,
                                                bcVote.getId(),
                                                bcVote.getZxid(),
                                                bcVote.getElectionEpoch(),
                                                self.getPeerState(),
                                                response.sid,
                                                bcVote.getPeerEpoch());
                                    }
                                    sendqueue.offer(notmsg);
                                }
                            }
                        }
                    } catch (InterruptedException e) {
                        System.out.println("Interrupted Exception while waiting for new message" +
                                e.toString());
                    }
                }
                LOG.info("WorkerReceiver is down");
            }
        }


        /**
         * This worker simply dequeues a message to send and
         * and queues it on the manager's queue.
         */

        class WorkerSender extends ZooKeeperThread {
            volatile boolean stop;
            QuorumCnxManager manager;

            WorkerSender(QuorumCnxManager manager){
                super("WorkerSender");
                this.stop = false;
                this.manager = manager;
            }

            public void run() {
                while (!stop) {
                    try {
                        ToSend m = sendqueue.poll(3000, TimeUnit.MILLISECONDS);
                        if(m == null) continue;

                        process(m);
                    } catch (InterruptedException e) {
                        break;
                    }
                }
                LOG.info("WorkerSender is down");
            }

            /**
             * Called by run() once there is a new message to send.
             *
             * @param m     message to send
             */
            void process(ToSend m) {
                ByteBuffer requestBuffer = buildMsg(m.state.ordinal(), 
                                                        m.leader,
                                                        m.zxid, 
                                                        m.electionEpoch, 
                                                        m.peerEpoch);
                manager.toSend(m.sid, requestBuffer);
            }
        }


        // The two workers; each one is run by its own daemon thread.
        WorkerSender ws;
        WorkerReceiver wr;

        /**
         * Constructor of class Messenger. Creates the sender and the
         * receiver worker and starts a daemon thread for each of them,
         * the sender first.
         *
         * @param manager   Connection manager
         */
        Messenger(QuorumCnxManager manager) {

            ws = new WorkerSender(manager);
            Thread senderThread = new Thread(ws,
                    "WorkerSender[myid=" + self.getId() + "]");
            senderThread.setDaemon(true);
            senderThread.start();

            wr = new WorkerReceiver(manager);
            Thread receiverThread = new Thread(wr,
                    "WorkerReceiver[myid=" + self.getId() + "]");
            receiverThread.setDaemon(true);
            receiverThread.start();
        }

        /**
         * Asks both workers to stop by raising their {@code stop} flags;
         * each worker checks the flag in its run() loop.
         */
        void halt(){
            ws.stop = true;
            wr.stop = true;
        }

    }

    QuorumPeer self;  // The peer (this server) that takes part in the election
    Messenger messenger;
    // Logical clock of the election, held in an AtomicLong
    AtomicLong logicalclock = new AtomicLong(); /* Election instance */
    // Current proposal (vote) of this server
    long proposedLeader;
    long proposedZxid;
    long proposedEpoch;


    /**
     * Returns the current value of the logical clock counter.
     *
     * @return value of the logical clock at the time of the call
     */
    public long getLogicalClock(){
        final long currentRound = logicalclock.get();
        return currentRound;
    }

    /**
     * Constructor of FastLeaderElection. It takes two parameters, one
     * is the QuorumPeer object that instantiated this object, and the other
     * is the connection manager. Such an object should be created only once
     * by each peer during an instance of the ZooKeeper service.
     *
     * @param self  QuorumPeer that created this object
     * @param manager   Connection manager
     */
    public FastLeaderElection(QuorumPeer self, QuorumCnxManager manager){
        this.manager = manager;
        this.stop = false;
        // Remaining initialization, including starting the messenger
        starter(self, manager);
    }

    /**
     * Invoked by the constructor to finish initialization: it stores the
     * peer, resets the proposal, creates the send and receive queues and
     * builds the Messenger. It is kept as a separate method so that any
     * future constructor can share it.
     *
     * @param self      QuorumPeer that created this object
     * @param manager   Connection manager
     */
    private void starter(QuorumPeer self, QuorumCnxManager manager) {
        // The queues and the peer reference must exist before the Messenger
        // is built, because its constructor starts the worker threads.
        this.sendqueue = new LinkedBlockingQueue<ToSend>();
        this.recvqueue = new LinkedBlockingQueue<Notification>();
        this.self = self;

        // No proposal yet
        this.proposedZxid = -1;
        this.proposedLeader = -1;

        this.messenger = new Messenger(manager);
    }

    /**
     * Leaves the current election instance: logs the given vote at debug
     * level and discards all notifications still waiting in recvqueue.
     *
     * @param v vote reported in the debug message
     */
    private void leaveInstance(Vote v) {
        if (LOG.isDebugEnabled()) {
            StringBuilder msg = new StringBuilder("About to leave FLE instance: leader=");
            msg.append(v.getId());
            msg.append(", zxid=0x").append(Long.toHexString(v.getZxid()));
            msg.append(", my id=").append(self.getId());
            msg.append(", my state=").append(self.getPeerState());
            LOG.debug(msg.toString());
        }
        recvqueue.clear();
    }

    /**
     * Returns the connection manager used by this election object.
     *
     * @return the QuorumCnxManager given to the constructor
     */
    public QuorumCnxManager getCnxManager(){
        return this.manager;
    }

    // Raised by shutdown(); the election loop in lookForLeader() checks it.
    volatile boolean stop;
    /**
     * Shuts this election object down: raises the stop flag, then halts
     * the connection manager and finally the messenger, in that order.
     */
    public void shutdown(){
        stop = true;
        LOG.debug("Shutting down connection manager");
        manager.halt();
        LOG.debug("Shutting down messenger");
        messenger.halt();
        LOG.debug("FLE is down");
    }


    /**
     * Send notifications to all peers upon a change in our vote.
     * One message carrying the current proposal is queued on sendqueue for
     * every server returned by self.getVotingView(); the actual sending
     * is done elsewhere, by the sender worker.
     */
    private void sendNotifications() {
        for (QuorumServer voter : self.getVotingView().values()) {
            // Server id of the recipient
            long recipientId = voter.id;

            // Wrap the current proposal, stamped with the logical clock of
            // the current election round and the LOOKING state.
            ToSend notification = new ToSend(ToSend.mType.notification,
                    proposedLeader,
                    proposedZxid,
                    logicalclock.get(),
                    ServerState.LOOKING,
                    recipientId,
                    proposedEpoch);

            if (LOG.isDebugEnabled()) {
                String text = "Sending Notification: " + proposedLeader + " (n.leader), 0x"
                        + Long.toHexString(proposedZxid) + " (n.zxid), 0x"
                        + Long.toHexString(logicalclock.get()) + " (n.round), "
                        + recipientId + " (recipient), " + self.getId()
                        + " (myid), 0x" + Long.toHexString(proposedEpoch) + " (n.peerEpoch)";
                LOG.debug(text);
            }

            // Hand the message over to the sender worker
            sendqueue.offer(notification);
        }
    }


    /**
     * Logs a notification at info level, followed by this peer's state.
     *
     * @param n notification to log
     */
    private void printNotification(Notification n){
        String text = "Notification: " + n.toString()
                + self.getPeerState() + " (my state)";
        LOG.info(text);
    }

    /**
     * Check if a proposed vote (newId, newZxid, newEpoch) succeeds our
     * current vote (curId, curZxid, curEpoch).
     *
     * Returns true if one of the following three cases holds:
     * 1- New epoch is higher
     * 2- New epoch is the same as current epoch, but new zxid is higher
     * 3- New epoch is the same as current epoch, new zxid is the same
     *  as current zxid, but server id is higher.
     *
     * @param newId     Server identifier of the proposed vote
     * @param newZxid   zxid of the proposed vote
     * @param newEpoch  epoch of the proposed vote
     * @param curId     Server identifier of the current vote
     * @param curZxid   zxid of the current vote
     * @param curEpoch  epoch of the current vote
     * @return true if the proposed vote wins over the current one; always
     *         false when the quorum verifier gives newId a weight of 0
     */
    protected boolean totalOrderPredicate(long newId, long newZxid, long newEpoch, long curId, long curZxid, long curEpoch) {
        // Guarded like the other debug statements of this class, so the
        // message is only built when debug logging is enabled.
        if (LOG.isDebugEnabled()) {
            LOG.debug("id: " + newId + ", proposed id: " + curId + ", zxid: 0x" +
                    Long.toHexString(newZxid) + ", proposed zxid: 0x" + Long.toHexString(curZxid));
        }

        // A server whose weight is 0 can never win the comparison
        if (self.getQuorumVerifier().getWeight(newId) == 0) {
            return false;
        }

        // Compare by epoch first, then by zxid, and finally by server id
        if (newEpoch != curEpoch) {
            return newEpoch > curEpoch;
        }
        if (newZxid != curZxid) {
            return newZxid > curZxid;
        }
        return newId > curId;
    }

    /**
     * Termination predicate. Given a set of votes, determines if
     * there are enough of them to declare the end of the election round.
     *
     * @param votes  votes received so far, keyed by the id of the voter
     * @param vote   the vote to look for
     * @return true if the servers whose recorded vote equals {@code vote}
     *         contain a quorum according to the quorum verifier
     */
    protected boolean termPredicate(
            HashMap<Long, Vote> votes,
            Vote vote) {

        /*
         * First make the views consistent. Sometimes peers will have
         * different zxids for a server depending on timing.
         */
        HashSet<Long> supporters = new HashSet<Long>();

        // Walk through every ballot and collect the voters that agree
        for (Map.Entry<Long, Vote> ballot : votes.entrySet()) {
            if (!vote.equals(ballot.getValue())) {
                continue;
            }
            supporters.add(ballot.getKey());
        }

        return self.getQuorumVerifier().containsQuorum(supporters);
    }

    /**
     * In the case there is a leader elected, and a quorum supporting
     * this leader, we have to check if the leader has voted and acked
     * that it is leading. We need this check to avoid that peers keep
     * electing over and over a peer that has crashed and it is no
     * longer leading.
     *
     * @param votes set of votes
     * @param   leader  leader id
     * @param   electionEpoch   epoch id
     * @return true if the leader can be trusted to be leading
     */
    protected boolean checkLeader(
            HashMap<Long, Vote> votes,
            long leader,
            long electionEpoch){

        /*
         * If everyone else thinks I'm the leader, I must be the leader,
         * provided the election epoch matches my logical clock.
         */
        if (leader == self.getId()) {
            return logicalclock.get() == electionEpoch;
        }

        /*
         * I'm not the leader: I need a message from the leader itself
         * stating that it is leading.
         */
        Vote leaderVote = votes.get(leader);
        return leaderVote != null
                && leaderVote.getState() == ServerState.LEADING;
    }
    
    /**
     * This predicate checks that a leader has been elected. It doesn't
     * make a lot of sense without context (check lookForLeader) and it
     * has been separated for testing purposes.
     *
     * @param recv  map of received votes
     * @param ooe   map containing out of election votes (LEADING or FOLLOWING)
     * @param n     Notification
     * @return true if the vote carried by {@code n} satisfies termPredicate
     *         on {@code recv} and its leader passes checkLeader on {@code ooe}
     */
    protected boolean ooePredicate(HashMap<Long,Vote> recv, 
                                    HashMap<Long,Vote> ooe, 
                                    Notification n) {

        Vote candidate = new Vote(n.version,
                                  n.leader,
                                  n.zxid,
                                  n.electionEpoch,
                                  n.peerEpoch,
                                  n.state);

        if (!termPredicate(recv, candidate)) {
            return false;
        }
        return checkLeader(ooe, n.leader, n.electionEpoch);
    }

    /**
     * Replaces the current proposal (vote) of this server.
     *
     * @param leader  server id of the newly proposed leader
     * @param zxid    zxid of the newly proposed leader
     * @param epoch   epoch of the newly proposed leader
     */
    synchronized void updateProposal(long leader, long zxid, long epoch){
        if (LOG.isDebugEnabled()) {
            StringBuilder msg = new StringBuilder("Updating proposal: ");
            msg.append(leader).append(" (newleader), 0x");
            msg.append(Long.toHexString(zxid)).append(" (newzxid), ");
            msg.append(proposedLeader).append(" (oldleader), 0x");
            msg.append(Long.toHexString(proposedZxid)).append(" (oldzxid)");
            LOG.debug(msg.toString());
        }

        // Record the new proposal
        this.proposedLeader = leader;
        this.proposedZxid = zxid;
        this.proposedEpoch = epoch;
    }

    /**
     * Returns the current proposal of this server as a new Vote.
     *
     * @return vote built from proposedLeader, proposedZxid and proposedEpoch
     */
    synchronized Vote getVote(){
        Vote proposal = new Vote(proposedLeader, proposedZxid, proposedEpoch);
        return proposal;
    }

    /**
     * A learning state can be either FOLLOWING or OBSERVING.
     * This method simply decides which one depending on the
     * learner type of this server.
     *
     * @return FOLLOWING for a participant, OBSERVING otherwise
     */
    private ServerState learningState(){
        if (self.getLearnerType() != LearnerType.PARTICIPANT) {
            LOG.debug("I'm an observer: " + self.getId());
            return ServerState.OBSERVING;
        }

        LOG.debug("I'm a participant: " + self.getId());
        return ServerState.FOLLOWING;
    }

    /**
     * Returns the initial vote value of server identifier.
     *
     * @return the id of this server if it is a participant,
     *         Long.MIN_VALUE otherwise
     */
    private long getInitId(){
        boolean participant = (self.getLearnerType() == LearnerType.PARTICIPANT);
        return participant ? self.getId() : Long.MIN_VALUE;
    }

    /**
     * Returns initial last logged zxid.
     *
     * @return the last zxid logged by this server if it is a participant,
     *         Long.MIN_VALUE otherwise
     */
    private long getInitLastLoggedZxid(){
        boolean participant = (self.getLearnerType() == LearnerType.PARTICIPANT);
        return participant ? self.getLastLoggedZxid() : Long.MIN_VALUE;
    }

    /**
     * Returns the initial vote value of the peer epoch.
     *
     * @return the current epoch of this server if it is a participant,
     *         Long.MIN_VALUE otherwise
     * @throws RuntimeException if reading the current epoch fails with an
     *         IOException; that exception is attached as the cause
     */
    private long getPeerEpoch(){
        if (self.getLearnerType() != LearnerType.PARTICIPANT) {
            return Long.MIN_VALUE;
        }

        try {
            return self.getCurrentEpoch();
        } catch (IOException e) {
            // Chain the IOException as the cause instead of copying only
            // its message, so that nested causes are not lost. The stack
            // trace is still copied, as before, for callers that rely on it.
            RuntimeException re = new RuntimeException(e.getMessage(), e);
            re.setStackTrace(e.getStackTrace());
            throw re;
        }
    }
    
    /**
     * Starts a new round of leader election. Whenever our QuorumPeer
     * changes its state to LOOKING, this method is invoked, and it
     * sends notifications to all other peers.
     * 开启新一轮的leader选举。无论何时,只要我们的QuorumPeer的
     * 状态变为了LOOKING,那么这个方法将被调用,并且它会发送notifications
     * 给所有其它的同级服务器。
     */
    public Vote lookForLeader() throws InterruptedException {
        /*
         * Runs one leader election round. Returns the final Vote once a leader
         * has been decided, or null if the main loop ends (peer state is no
         * longer LOOKING, or stop is set) before a decision was reached.
         */
        /*---1--- Pre-election setup: register a LeaderElectionBean with JMX
         * (Java Management Extensions). A registration failure is logged and
         * the bean reference is reset to null; the election still proceeds. */
        try {
            self.jmxLeaderElectionBean = new LeaderElectionBean();
            MBeanRegistry.getInstance().register(
                    self.jmxLeaderElectionBean, self.jmxLocalPeerBean);
        } catch (Exception e) {
            LOG.warn("Failed to register with JMX", e);
            self.jmxLeaderElectionBean = null;
        }
        if (self.start_fle == 0) {
            /* Record the election start time only when it has not been set yet.
             * NOTE(review): Time.currentElapsedTime() presumably returns a
             * monotonic, JVM-relative time in milliseconds (unlike the
             * adjustable wall clock) - confirm against
             * org.apache.zookeeper.common.Time. */
           self.start_fle = Time.currentElapsedTime();
        }
        try {
            /* The "ballot box": votes received from LOOKING peers during this
             * round. Key is the voter's server id (n.sid), value is the vote
             * that voter sent. */
            HashMap<Long, Vote> recvset = new HashMap<Long, Vote>();
            /* Votes reported by peers that are already FOLLOWING or LEADING,
             * i.e. peers that are no longer taking part in an election.
             * Filled in the FOLLOWING/LEADING case below. */
            HashMap<Long, Vote> outofelection = new HashMap<Long, Vote>();
            // notTimeout = notification timeout (ms) used when polling recvqueue;
            // it starts at finalizeWait and is doubled after each empty poll.
            int notTimeout = finalizeWait;
            /*---2--- Publish the initial proposal. */
            synchronized(this){
                // A new election round starts, so advance the logical clock by one.
                logicalclock.incrementAndGet();
                /* Reset the proposal to the initial values.
                 * NOTE(review): getInitId()/getInitLastLoggedZxid()/getPeerEpoch()
                 * presumably describe this server itself ("vote for myself
                 * first") - confirm in their definitions. */
                updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
            }

            LOG.info("New election. My id =  " + self.getId() +
                    ", proposed zxid=0x" + Long.toHexString(proposedZxid));
            // Announce the current proposal to the other peers. The actual
            // sending is implemented in sendNotifications(), not here.
            sendNotifications();

            /*---3--- Core of the algorithm: compare our proposal with the
             * proposals carried by incoming notifications. */
            /*
             * Loop in which we exchange notifications until we find a leader
             */
            // Keep going while this server is still LOOKING and the election
            // has not been stopped.
            while ((self.getPeerState() == ServerState.LOOKING) &&
                    (!stop)){
                /*
                 * Remove next notification from queue, times out after 2 times
                 * the termination time
                 */
                /* Take one received notification from recvqueue, waiting at
                 * most notTimeout milliseconds for one to arrive. */
                Notification n = recvqueue.poll(notTimeout,
                        TimeUnit.MILLISECONDS);

                /*
                 * Sends more notifications if haven't received enough.
                 * Otherwise processes new notification.
                 */
                /* n is null when no notification arrived within the timeout
                 * although the election is not decided yet. Typical cause
                 * (author's note): fewer replies came back than notifications
                 * were sent, so no quorum could be formed from what was
                 * received. */
                if(n == null){
                    /* NOTE(review): haveDelivered() presumably returns true when
                     * the connection manager got its outgoing messages out, i.e.
                     * connectivity to the ensemble looks fine - confirm in
                     * QuorumCnxManager. */
                    if(manager.haveDelivered()){
                        /* Re-send our proposal to trigger fresh replies.
                         * Repeated votes from the same peer do not add entries
                         * to recvset because it is keyed by the sender's sid. */
                        sendNotifications();
                    /* Otherwise (haveDelivered() returned false) only
                     * re-establish the connections. */
                    } else {
                        // Reconnect to all other servers without re-sending the
                        // proposal.
                        // Author's rationale (not verifiable from this method
                        // alone): if this server was cut off, the other servers
                        // did not hear from it either, so they will time out
                        // and run their own sendNotifications(); once the
                        // connections are back, this server receives those
                        // notifications and can answer them, so reconnecting
                        // is sufficient here.
                        manager.connectAll();
                    }

                    /*
                     * Exponential backoff
                     */
                    /* Double the poll timeout, capped at maxNotificationInterval. */
                    int tmpTimeOut = notTimeout*2;
                    notTimeout = (tmpTimeOut < maxNotificationInterval?
                            tmpTimeOut : maxNotificationInterval);
                    LOG.info("Notification time out: " + notTimeout);
                }
                /* A notification was received. Process it only if
                 * 1) validVoter(n.sid): the sender is in the voting view, and
                 * 2) validVoter(n.leader): the leader it proposes is in the
                 *    voting view as well.
                 * Anything else is logged and ignored in the else branch at the
                 * end of the loop body. */
                else if(validVoter(n.sid) && validVoter(n.leader)) {
                    /*
                     * Only proceed if the vote comes from a replica in the
                     * voting view for a replica in the voting view.
                     */
                    switch (n.state) { // dispatch on the sender's state
                    case LOOKING: // the sender is electing as well
                        // If notification > current, replace and send messages out
                        /* The sender's election epoch (n.electionEpoch) is ahead
                         * of our logical clock: our round is outdated. Normally
                         * both values are equal; they are the same round counter
                         * seen from the two sides. */
                        if (n.electionEpoch > logicalclock.get()) {
                            // Catch up with the sender's round.
                            logicalclock.set(n.electionEpoch);
                            // Votes collected for the outdated round are discarded.
                            recvset.clear();
                            /* Restart from the initial proposal: compare the
                             * notification's proposal with our initial values
                             * and adopt whichever totalOrderPredicate prefers.
                             * Either way the (re)set proposal is sent out. */
                            if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
                                    getInitId(), getInitLastLoggedZxid(), getPeerEpoch())) {
                                updateProposal(n.leader, n.zxid, n.peerEpoch);// notification's proposal wins
                            } else {// our initial proposal wins
                                updateProposal(getInitId(),
                                        getInitLastLoggedZxid(),
                                        getPeerEpoch());
                            }
                            sendNotifications();
                        /* The sender's election epoch is behind our logical
                         * clock: the sender is in an outdated round, so the
                         * notification is dropped (not recorded in recvset). */
                        } else if (n.electionEpoch < logicalclock.get()) {
                            if(LOG.isDebugEnabled()){
                                LOG.debug("Notification election epoch is smaller than logicalclock. n.electionEpoch = 0x"
                                        + Long.toHexString(n.electionEpoch)
                                        + ", logicalclock=0x" + Long.toHexString(logicalclock.get()));
                            }
                            break;
                        /* Same round: compare the notification's proposal with
                         * our current proposal. The branch is taken when
                         * totalOrderPredicate returns true for the notification's
                         * proposal. */
                        } else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
                                proposedLeader, proposedZxid, proposedEpoch)) {
                            // Adopt the notification's proposal and announce it.
                            updateProposal(n.leader, n.zxid, n.peerEpoch);
                            sendNotifications();
                        }

                        if(LOG.isDebugEnabled()){
                            LOG.debug("Adding vote: from=" + n.sid +
                                    ", proposed leader=" + n.leader +
                                    ", proposed zxid=0x" + Long.toHexString(n.zxid) +
                                    ", proposed election epoch=0x" + Long.toHexString(n.electionEpoch));
                        }
                        // Record the sender's vote in the ballot box.
                        recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));

                        /*---4--- Decide whether this round can end. */
                        /* NOTE(review): termPredicate presumably returns true
                         * when a quorum of the votes in recvset backs our
                         * current proposal - confirm in its definition. */
                        if (termPredicate(recvset,
                                new Vote(proposedLeader, proposedZxid,
                                        logicalclock.get(), proposedEpoch))) {

                            // Verify if there is any change in the proposed leader
                            // The inner while loop below has two exits:
                            // 1) loop condition: poll() returned null, i.e. no
                            //    notification that beats the current proposal
                            //    arrived within finalizeWait ms; n is null here.
                            // 2) break: a polled notification beats the current
                            //    proposal according to totalOrderPredicate; it is
                            //    put back on recvqueue and n is non-null here.
                            // Polled notifications that do not beat the current
                            // proposal are simply consumed.
                            while((n = recvqueue.poll(finalizeWait,
                                    TimeUnit.MILLISECONDS)) != null){
                                if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
                                        proposedLeader, proposedZxid, proposedEpoch)){
                                    recvqueue.put(n);
                                    break;// election not finished; the outer loop will process n
                                }
                            }

                            /*
                             * This predicate is true once we don't read any new
                             * relevant message from the reception queue
                             */
                            /* n == null: nothing better showed up, so the current
                             * proposal is the elected leader. */
                            if (n == null) {// leader found: finalize and update this peer's state
                                // LEADING when the elected leader is this server,
                                // otherwise whatever learningState() returns.
                                self.setPeerState((proposedLeader == self.getId()) ?
                                        ServerState.LEADING: learningState());
                                // Build the final vote.
                                Vote endVote = new Vote(proposedLeader,
                                                        proposedZxid,
                                                        logicalclock.get(),
                                                        proposedEpoch);
                                // NOTE(review): leaveInstance presumably clears
                                // the receive queue - confirm in its definition.
                                leaveInstance(endVote);
                                return endVote;
                            }
                        }
                        break;// leave the switch; the outer while loop then polls the next notification
                    /*---5--- Cases where no vote comparison is needed. */
                    // Notifications sent by observers are only logged; they do
                    // not influence the election.
                    case OBSERVING:
                        LOG.debug("Notification from observer: " + n.sid);
                        break;
                    /* Author's background notes (not verifiable from this method
                     * alone) on when FOLLOWING/LEADING notifications show up:
                     *  1) A new non-observer server joins a running ensemble. It
                     *     starts in LOOKING and announces itself as leader; the
                     *     current leader and followers answer with state LEADING
                     *     and FOLLOWING respectively.
                     *  2) The leader crashed but not every follower noticed yet.
                     *     A server that noticed switches to LOOKING and sends
                     *     notifications; servers that have not noticed still
                     *     answer with state FOLLOWING.
                     *  3) Other servers already finished the current round and
                     *     changed state before this server learned the result;
                     *     their notifications carry LEADING or FOLLOWING.
                     * */
                    case FOLLOWING:
                    case LEADING:
                        /*
                         * Consider all notifications from the same epoch
                         * together.
                         */
                        /* The notification belongs to the same election round as
                         * our logical clock (scenario 3 in the notes above). */
                        if(n.electionEpoch == logicalclock.get()){
                            // Record the vote in the ballot box.
                            recvset.put(n.sid, new Vote(n.leader,
                                                          n.zxid,
                                                          n.electionEpoch,
                                                          n.peerEpoch));

                            // If ooePredicate accepts n against recvset, adopt
                            // n's leader and finish the election.
                            if(ooePredicate(recvset, outofelection, n)) {
                                self.setPeerState((n.leader == self.getId()) ?
                                        ServerState.LEADING: learningState());

                                Vote endVote = new Vote(n.leader, 
                                        n.zxid, 
                                        n.electionEpoch, 
                                        n.peerEpoch);
                                leaveInstance(endVote);
                                return endVote;
                            }
                        }

                        /*
                         * Before joining an established ensemble, verify
                         * a majority is following the same leader.
                         */
                        /* Handles a server joining an ensemble that already has a
                         * leader. Per the author's notes this covers
                         * 1) a round that has been decided elsewhere but is not
                         *    finished from this server's point of view, and
                         * 2) a new server joining a normally running ensemble.
                         * The sender is FOLLOWING or LEADING here, so its vote is
                         * recorded in outofelection (including version and state). */
                        outofelection.put(n.sid, new Vote(n.version,
                                                            n.leader,
                                                            n.zxid,
                                                            n.electionEpoch,
                                                            n.peerEpoch,
                                                            n.state));
                        /* Evaluate n against outofelection for both arguments of
                         * ooePredicate; on success follow n's leader.
                         * NOTE(review): presumably this checks quorum support for
                         * n within outofelection - confirm in ooePredicate. */
                        if(ooePredicate(outofelection, outofelection, n)) {
                            synchronized(this){
                                // Align our logical clock with the election epoch
                                // carried by the notification.
                                logicalclock.set(n.electionEpoch);
                                self.setPeerState((n.leader == self.getId()) ?
                                        ServerState.LEADING: learningState());
                            }
                            Vote endVote = new Vote(n.leader,
                                                    n.zxid,
                                                    n.electionEpoch,
                                                    n.peerEpoch);
                            leaveInstance(endVote);
                            return endVote;
                        }
                        break;
                    default:
                        LOG.warn("Notification state unrecognized: {} (n.state), {} (n.sid)",
                                n.state, n.sid);
                        break;
                    }
                } else {
                    // The sender and/or the proposed leader is not in the voting
                    // view: log which one and ignore the notification.
                    if (!validVoter(n.leader)) {
                        LOG.warn("Ignoring notification for non-cluster member sid {} from sid {}", n.leader, n.sid);
                    }
                    if (!validVoter(n.sid)) {
                        LOG.warn("Ignoring notification for sid {} from non-quorum member sid {}", n.leader, n.sid);
                    }
                }
            }  // end of the main while loop
            // The loop ended without a decision (state changed or stop was set).
            return null;
        } finally {
            // Always unregister the JMX bean, whichever way the method exits.
            try {
                if(self.jmxLeaderElectionBean != null){
                    MBeanRegistry.getInstance().unregister(
                            self.jmxLeaderElectionBean);
                }
            } catch (Exception e) {
                LOG.warn("Failed to unregister with JMX", e);
            }
            self.jmxLeaderElectionBean = null;
            LOG.debug("Number of connection processing threads: {}",
                    manager.getConnectionThreadCount());
        }
    }

    /**
     * Tells whether a server id belongs to the voting view reported by
     * {@code self.getVotingView()}.
     *
     * @param sid     Server identifier to look up
     * @return {@code true} when the voting view has an entry keyed by
     *         {@code sid}, {@code false} otherwise
     */
    private boolean validVoter(long sid) {
        // Box explicitly; the view is queried with the boxed server id.
        final Long voterId = Long.valueOf(sid);
        final boolean inVotingView = self.getVotingView().containsKey(voterId);
        return inVotingView;
    }
}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值