一、zookeeper的顺序一致性
- google的chubby就是解决分布式环境下多个服务选举出leader时,由于网络环境的不可靠性,会出现丢失、篡改问题(拜占庭将军),因此服务间用paxos算法实现分布式锁和master选举。zk就是chubby的开源实现。
- zk是使用zab协议完成数据同步,并不是强一致性,而是简化版的顺序一致性,如图:
1.当客户端B和C在zk修改时去读取x的值,如果B读到x=0,那么C可能0或1;如果B读到x=1,C在B时间点后,那么C一定是x=1。这就是zk的顺序一致性
2.由于网络原因客户端同时发送请求可能有先后顺序,因此读到的值不一定是以客户端的时间轴为准。
3.zk不保证不同客户端的zk数据视图一模一样。假设A,B两个客户端,A将x=1改为2,B读取可能x=1,如果想B每次读取更新后的值,可以使用zk提供的sync方法。
4.zk基于zxid和阻塞队列保证请求的顺序一致性,客户端A请求zk读取数据,zk会返回数据和zxid给客户端,如果客户端A断开重连zk集群,连接到数据还未同步的follower上,那么客户端A用记录到最大的zxid拿去对比follower的zxid,发现自己的zxid比follower的大,那么就会连接失败。这算不算是对zk官网说到的single system image 的解释,一个客户端只要连接到zk,就不会去读到旧数据。
二、zk集群leader选举分析
-
我们在配置zk集群的时候,会在zk安装目录下/data下写入myid文件,假设三个节点,分别为1,2,3。再去/conf目录下配置zoo.cfg,新增server.1=127.0.0.1:2887:3887,server.2=127.0.0.1:2888:3888,server.3=127.0.0.1:2889:3889,分别代表不同的myid,第一个端口为zk集群通信,第二个端口为leader选举。
-
启动zk的时候会 执行:sh zkServer.sh 脚本,运行org.apache.zookeeper.server.quorum.QuorumPeerMain这个类的main方法。
1.1. 入口QuorumPeerMain的main方法 public static void main(String[] args) { QuorumPeerMain main = new QuorumPeerMain(); try { #1.2 //启动配置args,会传入zoo.cfg main.initializeAndRun(args); } catch (IllegalArgumentException e) { LOG.error("Invalid arguments, exiting abnormally", e); LOG.info(USAGE); System.err.println(USAGE); System.exit(2); } catch (ConfigException e) { LOG.error("Invalid config, exiting abnormally", e); System.err.println("Invalid config, exiting abnormally"); System.exit(2); } catch (Exception e) { LOG.error("Unexpected exception, exiting abnormally", e); System.exit(1); } LOG.info("Exiting normally"); System.exit(0); } 1.2. protected void initializeAndRun(String[] args) throws ConfigException, IOException { //保存zoo.cfg文件解析之后的所有参数(一定在后面有用到) QuorumPeerConfig config = new QuorumPeerConfig(); if (args.length == 1) { #1.3 config.parse(args[0]); } // ()Start and schedule the the purge task DatadirCleanupManager purgeMgr = new DatadirCleanupManager(config .getDataDir(), config.getDataLogDir(), config .getSnapRetainCount(), config.getPurgeInterval()); purgeMgr.start(); if (args.length == 1 && config.servers.size() > 0) { #1.4 //如果args==1,走这段代码 runFromConfig(config); } else { LOG.warn("Either no config or no quorum defined in config, running " + " in standalone mode"); // there is only server in the quorum -- run as standalone ZooKeeperServerMain.main(args); } } 1.3. public void parse(String path) throws ConfigException { File configFile = new File(path); LOG.info("Reading configuration from: " + configFile); try { if (!configFile.exists()) { throw new IllegalArgumentException(configFile.toString() + " file is missing"); } //将文件流转为properties Properties cfg = new Properties(); FileInputStream in = new FileInputStream(configFile); try { cfg.load(in); } finally { in.close(); } //解析配置文件 //会将三个节点server.1=ip:port1:port2保存起来 parseProperties(cfg); } catch (IOException e) { throw new ConfigException("Error processing " + path, e); } catch (IllegalArgumentException e) { throw new ConfigException("Error processing " + path, e); } } 1.4. 从配置中运行 public void runFromConfig(QuorumPeerConfig config) throws IOException { try { ManagedUtil.registerLog4jMBeans(); } catch (JMException e) { LOG.warn("Unable to register log4j JMX control", e); } LOG.info("Starting quorum peer"); try { //创建通信工厂,默认用的NIO,配置了就用Netty ServerCnxnFactory cnxnFactory = ServerCnxnFactory.createFactory(); #1.5 cnxnFactory.configure(config.getClientPortAddress(), config.getMaxClientCnxns()); quorumPeer = getQuorumPeer(); //getView() //zoo.cfg里面解析的servers节点 quorumPeer.setQuorumPeers(config.getServers()); quorumPeer.setTxnFactory(new FileTxnSnapLog( new File(config.getDataLogDir()), new File(config.getDataDir()))); //选举类型用的什么算法,默认是3 quorumPeer.setElectionType(config.getElectionAlg()); quorumPeer.setMyid(config.getServerId()); quorumPeer.setTickTime(config.getTickTime()); quorumPeer.setInitLimit(config.getInitLimit()); quorumPeer.setSyncLimit(config.getSyncLimit()); quorumPeer.setQuorumListenOnAllIPs(config.getQuorumListenOnAllIPs()); // 设置cnxnFacotory quorumPeer.setCnxnFactory(cnxnFactory); quorumPeer.setQuorumVerifier(config.getQuorumVerifier()); quorumPeer.setClientPortAddress(config.getClientPortAddress()); quorumPeer.setMinSessionTimeout(config.getMinSessionTimeout()); quorumPeer.setMaxSessionTimeout(config.getMaxSessionTimeout()); quorumPeer.setZKDatabase(new ZKDatabase(quorumPeer.getTxnFactory())); quorumPeer.setLearnerType(config.getPeerType()); quorumPeer.setSyncEnabled(config.getSyncEnabled()); // sets quorum sasl authentication configurations quorumPeer.setQuorumSaslEnabled(config.quorumEnableSasl); if(quorumPeer.isQuorumSaslAuthEnabled()){ quorumPeer.setQuorumServerSaslRequired(config.quorumServerRequireSasl); quorumPeer.setQuorumLearnerSaslRequired(config.quorumLearnerRequireSasl); quorumPeer.setQuorumServicePrincipal(config.quorumServicePrincipal); quorumPeer.setQuorumServerLoginContext(config.quorumServerLoginContext); quorumPeer.setQuorumLearnerLoginContext(config.quorumLearnerLoginContext); } quorumPeer.setQuorumCnxnThreadsSize(config.quorumCnxnThreadsSize); quorumPeer.initialize(); #1.6 quorumPeer.start(); quorumPeer.join(); } catch (InterruptedException e) { // warn, but generally this is ok LOG.warn("Quorum Peer interrupted", e); } } 1.5. 通信配置初始化 @Override public void configure(InetSocketAddress addr, int maxcc) throws IOException { configureSaslLogin(); thread = new ZooKeeperThread(this, "NIOServerCxn.Factory:" + addr); //设置守护线程,运行在用户线程上的守护线程,只要用户线程不结束,守护线程一直运行直到用户线程结束 thread.setDaemon(true); maxClientCnxns = maxcc; this.ss = ServerSocketChannel.open(); //端口可复用 ss.socket().setReuseAddress(true); LOG.info("binding to port " + addr); //绑定ip 和端口2181 ss.socket().bind(addr); //非阻塞 ss.configureBlocking(false); //注册一个accept事件 ss.register(selector, SelectionKey.OP_ACCEPT); } 1.6. QuorumPeer启动 @Override public synchronized void start() { //加载数据() loadDataBase(); #1.7 //cnxnFacotory 通信 客户端的2181端口号 cnxnFactory.start(); #2.1 //开始leader选举-> 启动一个投票的监听、初始化一个选举算法FastLeader. startLeaderElection(); #4.1 //当前的QuorumPeer继承Thread,调用Thread.start() ->QuorumPeer.run() super.start(); } 1.7. 启动线程,运行NIOServerCnxnFactory的run方法 //当收到客户端的create/delete/setdata请求时,会进入这个方法 public void run() { while (!ss.socket().isClosed()) { try { selector.select(1000); Set<SelectionKey> selected; synchronized (this) { selected = selector.selectedKeys(); } ArrayList<SelectionKey> selectedList = new ArrayList<SelectionKey>( selected); Collections.shuffle(selectedList); for (SelectionKey k : selectedList) { if ((k.readyOps() & SelectionKey.OP_ACCEPT) != 0) { SocketChannel sc = ((ServerSocketChannel) k .channel()).accept(); InetAddress ia = sc.socket().getInetAddress(); int cnxncount = getClientCnxnCount(ia); if (maxClientCnxns > 0 && cnxncount >= maxClientCnxns){ LOG.warn("Too many connections from " + ia + " - max is " + maxClientCnxns ); sc.close(); } else { LOG.info("Accepted socket connection from " + sc.socket().getRemoteSocketAddress()); sc.configureBlocking(false); SelectionKey sk = sc.register(selector, SelectionKey.OP_READ); NIOServerCnxn cnxn = createConnection(sc, sk); sk.attach(cnxn); addCnxn(cnxn); } } else if ((k.readyOps() & (SelectionKey.OP_READ | SelectionKey.OP_WRITE)) != 0) { NIOServerCnxn c = (NIOServerCnxn) k.attachment(); c.doIO(k); } else { if (LOG.isDebugEnabled()) { LOG.debug("Unexpected ops in select " + k.readyOps()); } } } selected.clear(); } catch (RuntimeException e) { LOG.warn("Ignoring unexpected runtime exception", e); } catch (Exception e) { LOG.warn("Ignoring exception", e); } } closeAll(); LOG.info("NIOServerCnxn factory exited run method"); } 2.1. 启动leader选举 synchronized public void startLeaderElection() { try { //构建一个票据, (myid ,zxid ,epoch),用来投票的。 currentVote = new Vote(myid, getLastLoggedZxid(), getCurrentEpoch()); } catch(IOException e) { RuntimeException re = new RuntimeException(e.getMessage()); re.setStackTrace(e.getStackTrace()); throw re; } //getView返回保存的地址Map(Long, QuorumServer)= (1, ip:) for (QuorumServer p : getView().values()) { if (p.id == myid) { myQuorumAddr = p.addr; //地址: 1 ->myQuorumAddr=192.168.13.102 break; } } if (myQuorumAddr == null) { throw new RuntimeException("My id " + myid + " not in the peer list"); } //选举的策略 if (electionType == 0) { try { udpSocket = new DatagramSocket(myQuorumAddr.getPort()); responder = new ResponderThread(); responder.start(); } catch (SocketException e) { throw new RuntimeException(e); } } #2.2 //创建选举算法 this.electionAlg = createElectionAlgorithm(electionType); } 2.2. protected Election createElectionAlgorithm(int electionAlgorithm){ Election le=null; //TODO: use a factory rather than a switch switch (electionAlgorithm) { case 0: le = new LeaderElection(this); break; case 1: le = new AuthFastLeaderElection(this); break; case 2: le = new AuthFastLeaderElection(this, true); break; case 3: //QuorumCnxManager用来接收投票的。职责划分明确 qcm = createCnxnManager(); QuorumCnxManager.Listener listener = qcm.listener; if(listener != null){ #2.3 //启动两个worker线程进行投票的收发 listener.start(); #3.1 //创建一个FastLeaderElection选举算法,处理投票 le = new FastLeaderElection(this, qcm); } else { LOG.error("Null listener when initializing cnx manager"); } break; default: assert false; } return le; } 2.3. Listener的run方法 @Override public void run() { int numRetries = 0; InetSocketAddress addr; while((!shutdown) && (numRetries < 3)){ try { ss = new ServerSocket(); ss.setReuseAddress(true); if (listenOnAllIPs) { int port = view.get(QuorumCnxManager.this.mySid) .electionAddr.getPort(); addr = new InetSocketAddress(port); } else { addr = view.get(QuorumCnxManager.this.mySid) .electionAddr; } LOG.info("My election bind port: " + addr.toString()); setName(view.get(QuorumCnxManager.this.mySid) .electionAddr.toString()); ss.bind(addr); while (!shutdown) { //阻塞在这里,等待连接 //先去3.1.看选举算法,待会再从选举算法回到2.4. Socket client = ss.accept(); setSockOpts(client); LOG.info("Received connection request " + client.getRemoteSocketAddress()); // Receive and handle the connection request // asynchronously if the quorum sasl authentication is // enabled. This is required because sasl server // authentication process may take few seconds to finish, // this may delay next peer connection requests. if (quorumSaslAuthEnabled) { receiveConnectionAsync(client); } else { #2.4 //接收连接 receiveConnection(client); } numRetries = 0; } } catch (IOException e) { LOG.error("Exception while listening", e); numRetries++; try { ss.close(); Thread.sleep(1000); } catch (IOException ie) { LOG.error("Error closing server socket", ie); } catch (InterruptedException ie) { LOG.error("Interrupted while sleeping. " + "Ignoring exception", ie); } } } LOG.info("Leaving listener"); if (!shutdown) { LOG.error("As I'm leaving the listener thread, " + "I won't be able to participate in leader " + "election any longer: " + view.get(QuorumCnxManager.this.mySid).electionAddr); } } 2.4. public void receiveConnection(final Socket sock) { DataInputStream din = null; try { din = new DataInputStream( new BufferedInputStream(sock.getInputStream())); #2.5 //处理接收到的数据包 handleConnection(sock, din); } catch (IOException e) { LOG.error("Exception handling connection, addr: {}, closing server connection", sock.getRemoteSocketAddress()); closeSocket(sock); } } 2.5. private void handleConnection(Socket sock, DataInputStream din) throws IOException { Long sid = null; try { // Read server id //myid sid = din.readLong(); if (sid < 0) { // this is not a server id but a protocol version (see ZOOKEEPER-1633) sid = din.readLong(); // next comes the #bytes in the remainder of the message // note that 0 bytes is fine (old servers) int num_remaining_bytes = din.readInt(); if (num_remaining_bytes < 0 || num_remaining_bytes > maxBuffer) { LOG.error("Unreasonable buffer length: {}", num_remaining_bytes); closeSocket(sock); return; } byte[] b = new byte[num_remaining_bytes]; // remove the remainder of the message from din int num_read = din.read(b); if (num_read != num_remaining_bytes) { LOG.error("Read only " + num_read + " bytes out of " + num_remaining_bytes + " sent by server " + sid); } } if (sid == QuorumPeer.OBSERVER_ID) { /* * Choose identifier at random. We need a value to identify * the connection. */ sid = observerCounter.getAndDecrement(); LOG.info("Setting arbitrary identifier to observer: " + sid); } } catch (IOException e) { closeSocket(sock); LOG.warn("Exception reading or writing challenge: " + e.toString()); return; } // do authenticating learner LOG.debug("Authenticating learner server.id: {}", sid); authServer.authenticate(sock, din); //防止重复去建立连接,只能myid大的去连接小的 //If wins the challenge, then close the new connection. if (sid < this.mySid) { /* * This replica might still believe that the connection to sid is * up, so we have to shut down the workers before trying to open a * new connection. */ SendWorker sw = senderWorkerMap.get(sid); if (sw != null) { sw.finish(); } /* * Now we start a new connection */ LOG.debug("Create new connection to server: " + sid); closeSocket(sock); //去建立连接 connectOne(sid); // Otherwise start worker threads to receive data. } else { SendWorker sw = new SendWorker(sock, sid); RecvWorker rw = new RecvWorker(sock, din, sid, sw); sw.setRecv(rw); SendWorker vsw = senderWorkerMap.get(sid); if(vsw != null) vsw.finish(); senderWorkerMap.put(sid, sw); queueSendMap.putIfAbsent(sid, new ArrayBlockingQueue<ByteBuffer>(SEND_CAPACITY)); sw.start(); rw.start(); return; } } 3.1. FastLeaderElection构造方法调用starter方法,创建两个阻塞队列并new Messenger private void starter(QuorumPeer self, QuorumCnxManager manager) { this.self = self; proposedLeader = -1; proposedZxid = -1; //发送队列 sendqueue = new LinkedBlockingQueue<ToSend>(); //接收队列 recvqueue = new LinkedBlockingQueue<Notification>(); #3.2 this.messenger = new Messenger(manager); } 3.2. Messenger构造 Messenger(QuorumCnxManager manager) { #3.3 //发送票据的线程(用于消费sendQueue) this.ws = new WorkerSender(manager); Thread t = new Thread(this.ws, "WorkerSender[myid=" + self.getId() + "]"); //守护线程 t.setDaemon(true); t.start(); #3.4 //接收票据的线程(用于消费recvqueue) this.wr = new WorkerReceiver(manager); t = new Thread(this.wr, "WorkerReceiver[myid=" + self.getId() + "]"); t.setDaemon(true); t.start(); } 3.3. WorkerSender public void run() { while (!stop) { try { //带有超时阻塞的机制去从阻塞队列中获得数据 ToSend m = sendqueue.poll(3000, TimeUnit.MILLISECONDS); if(m == null) continue; process(m); } catch (InterruptedException e) { break; } } LOG.info("WorkerSender is down"); } /** * Called by run() once there is a new message to send. * * @param m message to send */ void process(ToSend m) { ByteBuffer requestBuffer = buildMsg(m.state.ordinal(), m.leader, m.zxid, m.electionEpoch, m.peerEpoch); #3.3.1 manager.toSend(m.sid, requestBuffer); } 3.3.1 public void toSend(Long sid, ByteBuffer b) { /* * If sending message to myself, then simply enqueue it (loopback). */ if (this.mySid == sid) { b.position(0); addToRecvQueue(new Message(b.duplicate(), sid)); /* * Otherwise send to the corresponding thread to send. */ } else { /* * Start a new connection if doesn't have one already. */ ArrayBlockingQueue<ByteBuffer> bq = new ArrayBlockingQueue<ByteBuffer>(SEND_CAPACITY); ArrayBlockingQueue<ByteBuffer> bqExisting = queueSendMap.putIfAbsent(sid, bq); if (bqExisting != null) { //发送 addToSendQueue(bqExisting, b); } else { addToSendQueue(bq, b); } #3.3.2 //目标机器的sid connectOne(sid); } } 3.3.2 connectOne() → initiateConnection() → startConnection() private boolean startConnection(Socket sock, Long sid) throws IOException { DataOutputStream dout = null; DataInputStream din = null; try { // Sending id and challenge dout = new DataOutputStream(sock.getOutputStream()); dout.writeLong(this.mySid); dout.flush(); din = new DataInputStream( new BufferedInputStream(sock.getInputStream())); } catch (IOException e) { LOG.warn("Ignoring exception reading or writing challenge: ", e); closeSocket(sock); return false; } // authenticate learner authLearner.authenticate(sock, view.get(sid).hostname); 防止重复建立连接,myid大的主动连接myid小的 // If lost the challenge, then drop the new connection if (sid > this.mySid) { LOG.info("Have smaller server identifier, so dropping the " + "connection: (" + sid + ", " + this.mySid + ")"); closeSocket(sock); // Otherwise proceed with the connection } else { //开启线程,将WorkerSender创建的socket传入,用于发送票据,消费QuorumCnxManager中queueSendMap队列 SendWorker sw = new SendWorker(sock, sid); //开启线程,将收到的票据放入QuorumCnxManager的recvQueue队列中 RecvWorker rw = new RecvWorker(sock, din, sid, sw); sw.setRecv(rw); SendWorker vsw = senderWorkerMap.get(sid); if(vsw != null) vsw.finish(); senderWorkerMap.put(sid, sw); queueSendMap.putIfAbsent(sid, new ArrayBlockingQueue<ByteBuffer>(SEND_CAPACITY)); sw.start(); rw.start(); return true; } return false; } 3.4. WorkerReceiver public void run() { Message response; while (!stop) { // Sleeps on receive try{ //QuorumCnxManager中的recvQueue队列取出票据 response = manager.pollRecvQueue(3000, TimeUnit.MILLISECONDS); if(response == null) continue; /* * If it is from an observer, respond right away. * Note that the following predicate assumes that * if a server is not a follower, then it must be * an observer. If we ever have any other type of * learner in the future, we'll have to change the * way we check for observers. */ if(!validVoter(response.sid)){ Vote current = self.getCurrentVote(); ToSend notmsg = new ToSend(ToSend.mType.notification, current.getId(), current.getZxid(), logicalclock.get(), self.getPeerState(), response.sid, current.getPeerEpoch()); sendqueue.offer(notmsg); } else { // Receive new message if (LOG.isDebugEnabled()) { LOG.debug("Receive new notification message. My id = " + self.getId()); } /* * We check for 28 bytes for backward compatibility */ if (response.buffer.capacity() < 28) { LOG.error("Got a short response: " + response.buffer.capacity()); continue; } boolean backCompatibility = (response.buffer.capacity() == 28); response.buffer.clear(); // Instantiate Notification and set its attributes Notification n = new Notification(); // State of peer that sent this message QuorumPeer.ServerState ackstate = QuorumPeer.ServerState.LOOKING; switch (response.buffer.getInt()) { case 0: ackstate = QuorumPeer.ServerState.LOOKING; break; case 1: ackstate = QuorumPeer.ServerState.FOLLOWING; break; case 2: ackstate = QuorumPeer.ServerState.LEADING; break; case 3: ackstate = QuorumPeer.ServerState.OBSERVING; break; default: continue; } n.leader = response.buffer.getLong(); n.zxid = response.buffer.getLong(); n.electionEpoch = response.buffer.getLong(); n.state = ackstate; n.sid = response.sid; if(!backCompatibility){ n.peerEpoch = response.buffer.getLong(); } else { if(LOG.isInfoEnabled()){ LOG.info("Backward compatibility mode, server id=" + n.sid); } n.peerEpoch = ZxidUtils.getEpochFromZxid(n.zxid); } /* * Version added in 3.4.6 */ n.version = (response.buffer.remaining() >= 4) ? response.buffer.getInt() : 0x0; /* * Print notification info */ if(LOG.isInfoEnabled()){ printNotification(n); } /* * If this server is looking, then send proposed leader */ if(self.getPeerState() == QuorumPeer.ServerState.LOOKING){ recvqueue.offer(n); /* * Send a notification back if the peer that sent this * message is also looking and its logical clock is * lagging behind. */ if((ackstate == QuorumPeer.ServerState.LOOKING) && (n.electionEpoch < logicalclock.get())){ Vote v = getVote(); ToSend notmsg = new ToSend(ToSend.mType.notification, v.getId(), v.getZxid(), logicalclock.get(), self.getPeerState(), response.sid, v.getPeerEpoch()); sendqueue.offer(notmsg); } } else { /* * If this server is not looking, but the one that sent the ack * is looking, then send back what it believes to be the leader. */ Vote current = self.getCurrentVote(); if(ackstate == QuorumPeer.ServerState.LOOKING){ if(LOG.isDebugEnabled()){ LOG.debug("Sending new notification. My id = " + self.getId() + " recipient=" + response.sid + " zxid=0x" + Long.toHexString(current.getZxid()) + " leader=" + current.getId()); } ToSend notmsg; if(n.version > 0x0) { notmsg = new ToSend( ToSend.mType.notification, current.getId(), current.getZxid(), current.getElectionEpoch(), self.getPeerState(), response.sid, current.getPeerEpoch()); } else { Vote bcVote = self.getBCVote(); notmsg = new ToSend( ToSend.mType.notification, bcVote.getId(), bcVote.getZxid(), bcVote.getElectionEpoch(), self.getPeerState(), response.sid, bcVote.getPeerEpoch()); } sendqueue.offer(notmsg); } } } } catch (InterruptedException e) { System.out.println("Interrupted Exception while waiting for new message" + e.toString()); } } LOG.info("WorkerReceiver is down"); } 4.1. 启动QuorumPeer线程 进行选举 @Override public void run() { setName("QuorumPeer" + "[myid=" + getId() + "]" + cnxnFactory.getLocalAddress()); LOG.debug("Starting quorum peer"); try { jmxQuorumBean = new QuorumBean(this); MBeanRegistry.getInstance().register(jmxQuorumBean, null); for(QuorumServer s: getView().values()){ ZKMBeanInfo p; if (getId() == s.id) { p = jmxLocalPeerBean = new LocalPeerBean(this); try { MBeanRegistry.getInstance().register(p, jmxQuorumBean); } catch (Exception e) { LOG.warn("Failed to register with JMX", e); jmxLocalPeerBean = null; } } else { p = new RemotePeerBean(s); try { MBeanRegistry.getInstance().register(p, jmxQuorumBean); } catch (Exception e) { LOG.warn("Failed to register with JMX", e); } } } } catch (Exception e) { LOG.warn("Failed to register with JMX", e); jmxQuorumBean = null; } try { /* * Main loop * 死循环 */ while (running) { //第一次启动的时候,LOOKING switch (getPeerState()) { case LOOKING: LOG.info("LOOKING"); if (Boolean.getBoolean("readonlymode.enabled")) { LOG.info("Attempting to start ReadOnlyZooKeeperServer"); // Create read-only server but don't start it immediately final ReadOnlyZooKeeperServer roZk = new ReadOnlyZooKeeperServer( logFactory, this, new ZooKeeperServer.BasicDataTreeBuilder(), this.zkDb); // Instead of starting roZk immediately, wait some grace // period before we decide we're partitioned. // // Thread is used here because otherwise it would require // changes in each of election strategy classes which is // unnecessary code coupling. Thread roZkMgr = new Thread() { public void run() { try { // lower-bound grace period to 2 secs sleep(Math.max(2000, tickTime)); if (ServerState.LOOKING.equals(getPeerState())) { roZk.startup(); } } catch (InterruptedException e) { LOG.info("Interrupted while attempting to start ReadOnlyZooKeeperServer, not started"); } catch (Exception e) { LOG.error("FAILED to start ReadOnlyZooKeeperServer", e); } } }; try { roZkMgr.start(); setBCVote(null); setCurrentVote(makeLEStrategy().lookForLeader()); } catch (Exception e) { LOG.warn("Unexpected exception",e); setPeerState(ServerState.LOOKING); } finally { // If the thread is in the the grace period, interrupt // to come out of waiting. roZkMgr.interrupt(); roZk.shutdown(); } } else { try { setBCVote(null); # 4.1.1 //setCurrentVote -> 确定了谁是leader了。 setCurrentVote(makeLEStrategy().lookForLeader()); } catch (Exception e) { LOG.warn("Unexpected exception", e); setPeerState(ServerState.LOOKING); } } break; case OBSERVING: try { LOG.info("OBSERVING"); setObserver(makeObserver(logFactory)); observer.observeLeader(); } catch (Exception e) { LOG.warn("Unexpected exception",e ); } finally { observer.shutdown(); setObserver(null); setPeerState(ServerState.LOOKING); } break; case FOLLOWING: try { LOG.info("FOLLOWING"); setFollower(makeFollower(logFactory)); follower.followLeader(); //连接到leader } catch (Exception e) { LOG.warn("Unexpected exception",e); } finally { follower.shutdown(); setFollower(null); setPeerState(ServerState.LOOKING); } break; case LEADING: LOG.info("LEADING"); try { setLeader(makeLeader(logFactory)); leader.lead(); //lead 状态 setLeader(null); } catch (Exception e) { LOG.warn("Unexpected exception",e); } finally { if (leader != null) { leader.shutdown("Forcing shutdown"); setLeader(null); } setPeerState(ServerState.LOOKING); } break; } } } finally { LOG.warn("QuorumPeer main thread exited"); try { MBeanRegistry.getInstance().unregisterAll(); } catch (Exception e) { LOG.warn("Failed to unregister with JMX", e); } jmxQuorumBean = null; jmxLocalPeerBean = null; } } 4.1.1 public Vote lookForLeader() throws InterruptedException { try { self.jmxLeaderElectionBean = new LeaderElectionBean(); MBeanRegistry.getInstance().register( self.jmxLeaderElectionBean, self.jmxLocalPeerBean); } catch (Exception e) { LOG.warn("Failed to register with JMX", e); self.jmxLeaderElectionBean = null; } if (self.start_fle == 0) { self.start_fle = Time.currentElapsedTime(); } try { //接收到的票据的集合 HashMap<Long, Vote> recvset = new HashMap<Long, Vote>(); // HashMap<Long, Vote> outofelection = new HashMap<Long, Vote>(); int notTimeout = finalizeWait; synchronized(this){ //逻辑时钟->epoch logicalclock.incrementAndGet(); //proposal updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch()); } LOG.info("New election. My id = " + self.getId() + ", proposed zxid=0x" + Long.toHexString(proposedZxid)); //广播自己票据 sendNotifications(); /* * Loop in which we exchange notifications until we find a leader */ //接收到了票据 while ((self.getPeerState() == ServerState.LOOKING) && (!stop)){ /* * Remove next notification from queue, times out after 2 times * the termination time */ //recvqueue是从网络上接收到的其他机器的Notification Notification n = recvqueue.poll(notTimeout, TimeUnit.MILLISECONDS); /* * Sends more notifications if haven't received enough. * Otherwise processes new notification. */ if(n == null){ if(manager.haveDelivered()){ sendNotifications(); } else { manager.connectAll();//重新连接集群中的所有节点 } /* * Exponential backoff */ int tmpTimeOut = notTimeout*2; notTimeout = (tmpTimeOut < maxNotificationInterval? tmpTimeOut : maxNotificationInterval); LOG.info("Notification time out: " + notTimeout); } else if(validVoter(n.sid) && validVoter(n.leader)) {//判断是否是一个有效的票据 /* * Only proceed if the vote comes from a replica in the * voting view for a replica in the voting view. */ switch (n.state) { case LOOKING: //第一次进入到这个case // If notification > current, replace and send messages out if (n.electionEpoch > logicalclock.get()) { // logicalclock.set(n.electionEpoch); recvset.clear();//清空 //收到票据之后,当前的server要听谁的。 //可能是听server1的、也可能是听server2,也可能是听server3 //zab leader选举算法 if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch, getInitId(), getInitLastLoggedZxid(), getPeerEpoch())) { //把自己的票据更新成对方的票据,那么下一次,发送的票据就是新的票据 updateProposal(n.leader, n.zxid, n.peerEpoch); } else { //收到的票据小于当前的节点的票据,下一次发送票据,仍然发送自己的 updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch()); } //继续发送通知 sendNotifications(); //说明当前接收到的票据已经过期了,直接丢弃 } else if (n.electionEpoch < logicalclock.get()) { if(LOG.isDebugEnabled()){ LOG.debug("Notification election epoch is smaller than logicalclock. n.electionEpoch = 0x" + Long.toHexString(n.electionEpoch) + ", logicalclock=0x" + Long.toHexString(logicalclock.get())); } break; //这个判断表示收到的票据的 epoch 是相同的, //那么按照 epoch、zxid、myid 顺序进行比较比较成功以后,把对方的票据信息更新到自己的节点 } else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch, proposedLeader, proposedZxid, proposedEpoch)) { updateProposal(n.leader, n.zxid, n.peerEpoch); sendNotifications(); } if(LOG.isDebugEnabled()){ LOG.debug("Adding vote: from=" + n.sid + ", proposed leader=" + n.leader + ", proposed zxid=0x" + Long.toHexString(n.zxid) + ", proposed election epoch=0x" + Long.toHexString(n.electionEpoch)); } recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch)); //决断时刻(当前节点的更新后的vote信息,和recvset集合中的票据进行归纳,) if (termPredicate(recvset, new Vote(proposedLeader, proposedZxid, logicalclock.get(), proposedEpoch))) { // Verify if there is any change in the proposed leader //进入这个判断,说明选票达到了 leader 选举的要求 //在更新状态之前,服务器会等待 finalizeWait 毫秒时间来接收新的选票,以防止漏下关键选票。 //如果收到可能改变 Leader 的新选票,则重新进行计票 while((n = recvqueue.poll(finalizeWait, TimeUnit.MILLISECONDS)) != null){ if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch, proposedLeader, proposedZxid, proposedEpoch)){ recvqueue.put(n); break; } } /* * This predicate is true once we don't read any new * relevant message from the reception queue */ //如果 notifaction 为空,说明Leader节点确定好了 if (n == null) { //设置当前当前节点的状态(判断 leader 节点是不是我自己,如果是,直接更新当前节点的 state 为 LEADING) //否则,根据当前节点的特性进行判断,决定是FOLLOWING 还是 OBSERVING self.setPeerState((proposedLeader == self.getId()) ? ServerState.LEADING: learningState()); Vote endVote = new Vote(proposedLeader, proposedZxid, logicalclock.get(), proposedEpoch); leaveInstance(endVote); //返回这次选举的票据 return endVote; } } break; case OBSERVING: LOG.debug("Notification from observer: " + n.sid); break; case FOLLOWING: case LEADING: /* * Consider all notifications from the same epoch * together. */ if(n.electionEpoch == logicalclock.get()){ recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch)); if(ooePredicate(recvset, outofelection, n)) { self.setPeerState((n.leader == self.getId()) ? ServerState.LEADING: learningState()); Vote endVote = new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch); leaveInstance(endVote); return endVote; } } /* * Before joining an established ensemble, verify * a majority is following the same leader. */ outofelection.put(n.sid, new Vote(n.version, n.leader, n.zxid, n.electionEpoch, n.peerEpoch, n.state)); if(ooePredicate(outofelection, outofelection, n)) { synchronized(this){ logicalclock.set(n.electionEpoch); self.setPeerState((n.leader == self.getId()) ? ServerState.LEADING: learningState()); } Vote endVote = new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch); leaveInstance(endVote); return endVote; } break; default: LOG.warn("Notification state unrecognized: {} (n.state), {} (n.sid)", n.state, n.sid); break; } } else { if (!validVoter(n.leader)) { LOG.warn("Ignoring notification for non-cluster member sid {} from sid {}", n.leader, n.sid); } if (!validVoter(n.sid)) { LOG.warn("Ignoring notification for sid {} from non-quorum member sid {}", n.leader, n.sid); } } } return null; } finally { try { if(self.jmxLeaderElectionBean != null){ MBeanRegistry.getInstance().unregister( self.jmxLeaderElectionBean); } } catch (Exception e) { LOG.warn("Failed to unregister with JMX", e); } self.jmxLeaderElectionBean = null; LOG.debug("Number of connection processing threads: {}", manager.getConnectionThreadCount()); } }
-
投票 流程图
-
zk选举流程图