Zookeeper启动与Leader选举源码阅读

选举流程图解

以三个节点的集群为例,节点1和节点2选举完成后,加入节点3
在这里插入图片描述

ZooKeeper选举线程模型

在这里插入图片描述

ZooKeeper服务的启动(branch-3.8.0)

ZooKeeper服务端启动类QuorumPeerMain

从bin目录下的zkServer.sh或zkServer.cmd里找到启动主类为QuorumPeerMain

		QuorumPeerMain main = new QuorumPeerMain();
		try {
		    main.initializeAndRun(args);
		}catch{...}

protected void initializeAndRun(String[] args),加载zookeeper配置,启动服务

        // 加载配置
        QuorumPeerConfig config = new QuorumPeerConfig();
        if (args.length == 1) {
            config.parse(args[0]);
        }
        ......
        if (args.length == 1 && config.isDistributed()) {
            // 集群入口
            runFromConfig(config);
        } else {
            // 单机入口
            LOG.warn("Either no config or no quorum defined in config, running in standalone mode");
            // there is only server in the quorum -- run as standalone
            ZooKeeperServerMain.main(args);
        }

public void runFromConfig(QuorumPeerConfig config) zookeeper集群启动

		//设置ServerCnxnFactory,默认为NIOServerCnxnFactory,可以通过zookeeper.serverCnxnFactory修改
		if (config.getClientPortAddress() != null) {
			// 初始化网络通信的相关配置,NettyServerCnxnFactory的默认handler为CnxnChannelHandler
		    cnxnFactory = ServerCnxnFactory.createFactory();
		    cnxnFactory.configure(config.getClientPortAddress(), config.getMaxClientCnxns(), config.getClientPortListenBacklog(), false);
		}
		......
		// 初始化服务配置,配置来自于配置文件或者默认值,继承自ZooKeeperThread
		quorumPeer = getQuorumPeer();
		......
		// 设置选举算法类型,默认为3
		quorumPeer.setElectionType(config.getElectionAlg());
		......
		// zookeeper内存数据库
		quorumPeer.setZKDatabase(new ZKDatabase(quorumPeer.getTxnFactory()));
		......
		quorumPeer.start();
		......

QuorumPeer

public synchronized void start(),QuorumPeer的启动方法

		// 从本地文件中加载历史数据
		loadDataBase();
		// 绑定ServerCnxnFactory(默认为NIOServerCnxnFactory,也可配置为NettyServerCnxnFactory)的端口并启动
        startServerCnxnFactory();
        try {
            adminServer.start();
        } catch (AdminServerException e) {
            LOG.warn("Problem starting AdminServer", e);
        }
        // 启动Leader选举
        startLeaderElection();
        startJvmPauseMonitor();
        super.start();

public synchronized void startLeaderElection(),集群leader选举;

		// 服务端有四种状态LOOKING(寻找 Leader 状态,认为当前服务器没有 Leader,需要进行 Leader 选举)
		// FOLLOWING(跟随者,当前服务器角色是 Follower)、LEADING(领导者,当前服务器角色是 Leader)
		// OBSERVING(观察者,当前服务器角色是 Observer);启动后服务端默认状态为LOOKING
		if (getPeerState() == ServerState.LOOKING) {
			// 初始化选票,服务ID,最大的事务ID,当前选举周期
			currentVote = new Vote(myid, getLastLoggedZxid(), getCurrentEpoch());
		}
        // electionType在runFromConfig中设置,默认为3
        this.electionAlg = createElectionAlgorithm(electionType);

protected Election createElectionAlgorithm(int electionAlgorithm),创建选举算法并启动选举相关线程(Listener以及FastLeaderElection的收发线程);真正的选举在QuorumPeer的run方法中通过lookForLeader执行

		QuorumCnxManager qcm = createCnxnManager();
		......
		QuorumCnxManager.Listener listener = qcm.listener;
		if (listener != null) {
		    listener.start();
		    FastLeaderElection fle = new FastLeaderElection(this, qcm);
		    fle.start();
		    le = fle;
		} else {
		    LOG.error("Null listener when initializing cnx manager");
		}

super.start()启动QuorumPeer线程(QuorumPeer继承自ZooKeeperThread),线程中执行的即QuorumPeer的run方法

		......
        while (running) {
            ......
            switch (getPeerState()) {
            case LOOKING:
                ......
                // 执行FastLeaderElection.lookForLeader方法,执行完成后会返回Leader的选票
                setCurrentVote(makeLEStrategy().lookForLeader());
          		......
            case OBSERVING:
            	......
                setObserver(makeObserver(logFactory));
                observer.observeLeader();
                ......
            case FOLLOWING:
            	......
                setFollower(makeFollower(logFactory));
                // 建立与Leader的Socket,与Leader同步数据,接收Leader的数据
                follower.followLeader();
                ......
            case LEADING:
            	......
                setLeader(makeLeader(logFactory));
                // 建立数据通信的ServerSocket,与Follower建立通信,给Follower发送Ping消息
                leader.lead();
                setLeader(null);
                ......
            }
        }
        ......

FastLeaderElection leader选举类

关键属性

		// 存放需要发送的选票信息
		LinkedBlockingQueue<ToSend> sendqueue;
		// 存放收到的选票信息
	    LinkedBlockingQueue<Notification> recvqueue;
	    Messenger messenger;

Messenger的初始化方法

		this.ws = new WorkerSender(manager);
		this.wsThread = new Thread(this.ws, "WorkerSender[myid=" + self.getId() + "]");
		this.wsThread.setDaemon(true);
		this.wr = new WorkerReceiver(manager);
		this.wrThread = new Thread(this.wr, "WorkerReceiver[myid=" + self.getId() + "]");
		this.wrThread.setDaemon(true);

FastLeaderElection.start()调用Messenger.start(),Messenger.start()方法如下

		// 运行发送选票线程
		this.wsThread.start();
		// 运行接收选票线程
		this.wrThread.start();

FastLeaderElection.lookForLeader方法

		Map<Long, Vote> recvset = new HashMap<Long, Vote>();
		Map<Long, Vote> outofelection = new HashMap<Long, Vote>();
		int notTimeout = minNotificationInterval;
		synchronized (this) {
			// 选举周期+1
		    logicalclock.incrementAndGet();
		    // 更新当前节点的选票
		    updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
		}
		// 给其他节点发送选票(内容为当前节点的ID,最大事务ID,选举周期)
		sendNotifications();
		// 一直循环直到找到Leader
		while ((self.getPeerState() == ServerState.LOOKING) && (!stop)) {
			Notification n = recvqueue.poll(notTimeout, TimeUnit.MILLISECONDS);
			if (n == null) {
				......
				// 与其他节点建立连接
			    manager.connectAll();
			    ......
			}else if (validVoter(n.sid) && validVoter(n.leader)) {
				switch (n.state) {
				case LOOKING:
					if (n.electionEpoch > logicalclock.get()) {
						// 收到选票的周期 > 当前节点的周期
						// 设置当前节点的选票周期为收到的选票的周期
						logicalclock.set(n.electionEpoch);
						// 清理已经收到的选票
                        recvset.clear();
                        // 如果新的选票的事务ID+节点id的优先级大于当前节点的优先级,更新当前节点的选票为收到的选票
                        // 否则只更新当前选票的周期
                        if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch, getInitId(), getInitLastLoggedZxid(), getPeerEpoch())) {
                            updateProposal(n.leader, n.zxid, n.peerEpoch);
                        } else {
                            updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
                        }
                        // 重新发送选票给其他节点
                        sendNotifications();
					}else if (n.electionEpoch < logicalclock.get()) {
						//选票周期 < 当前节点的周期,不处理
					}else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch, proposedLeader, proposedZxid, proposedEpoch)) {
						// 周期相同,且选票优先级高于当前节点时,更新当前节点的选票为收到的选票
						updateProposal(n.leader, n.zxid, n.peerEpoch);
						// 重新发送选票给其他节点
						sendNotifications();
					}
					// 更新sid的选票
					recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
					// 如果recvset中的选票和当前节点的选票相同,在收到选票的集合中加入此节点
					voteSet = getVoteTracker(recvset, new Vote(proposedLeader, proposedZxid, logicalclock.get(), proposedEpoch));
					// 选票过半,voteSet中包含过半节点
					if (voteSet.hasAllQuorums()) {
						// 等待一段时间看是否还有新的选票
						while ((n = recvqueue.poll(finalizeWait, TimeUnit.MILLISECONDS)) != null) {
						    if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch, proposedLeader, proposedZxid, proposedEpoch)) {
						        recvqueue.put(n);
						        break;
						    }
						}
						// 选举结束
						if (n == null) {
						    setPeerState(proposedLeader, voteSet);
						    Vote endVote = new Vote(proposedLeader, proposedZxid, logicalclock.get(), proposedEpoch);
						    leaveInstance(endVote);
						    return endVote;
						}
					}
				}
			}
		}
		/**
		 * Vote "pick" logic: decides whether a new vote outranks the current one.
		 *
		 * <p>The comparison is lexicographic: the epoch is compared first, then the
		 * zxid (transaction id), and finally the server id. At each step the larger
		 * value wins, and a later field is only consulted when all earlier fields
		 * are equal.
		 *
		 * @param newId    server id carried by the new vote
		 * @param newZxid  zxid carried by the new vote
		 * @param newEpoch epoch carried by the new vote
		 * @param curId    server id of the vote currently held
		 * @param curZxid  zxid of the vote currently held
		 * @param curEpoch epoch of the vote currently held
		 * @return {@code true} only when the new (epoch, zxid, id) triple is strictly
		 *         greater than the current one; two identical votes yield {@code false}
		 */
		protected boolean totalOrderPredicate(long newId, long newZxid, long newEpoch, long curId, long curZxid, long curEpoch) {
			return ((newEpoch > curEpoch)
                || ((newEpoch == curEpoch)
                  && ((newZxid > curZxid)
                        || ((newZxid == curZxid)
                            && (newId > curId)))));
		}
		/**
		 * Checks whether the collected votes form a quorum.
		 *
		 * @return {@code false} as soon as one pair's QuorumVerifier reports that its
		 *         ack set does not contain a quorum; {@code true} otherwise
		 *         (including when {@code qvAcksetPairs} is empty, since the loop
		 *         body never runs)
		 */
		public boolean hasAllQuorums() {
			// NOTE(review): the original note states qvAcksetPairs is expected to hold
			// exactly one pair - this cannot be confirmed from this excerpt; the loop
			// itself handles any number of pairs.
	    	for (QuorumVerifierAcksetPair qvAckset : qvAcksetPairs) {
	        	if (!qvAckset.getQuorumVerifier().containsQuorum(qvAckset.getAckset())) {
	            	// One verifier lacks a quorum, so the overall check fails.
	            	return false;
	            }
	        }
	         return true;
	   }

private void sendNotifications(),生成需要发送给具有选举权的节点(包括自己)的选举信息并放入sendqueue队列

		// self.getCurrentAndNextConfigVoters()获取所有具有投票权的节点的ID集合
		for (long sid : self.getCurrentAndNextConfigVoters()) {
			QuorumVerifier qv = self.getQuorumVerifier();
			// 生成投票信息,目标为sid
			ToSend notmsg = new ToSend(
			    ToSend.mType.notification,
			    proposedLeader,
			    proposedZxid,
			    logicalclock.get(),
			    QuorumPeer.ServerState.LOOKING,
			    sid,
			    proposedEpoch,
			    qv.toString().getBytes(UTF_8));
			sendqueue.offer(notmsg);
		}
FastLeaderElection.WorkerSender

继承自ZooKeeperThread,发送选举消息线程

		public void run() {
			//获取需要发送的选票
			ToSend m = sendqueue.poll(3000, TimeUnit.MILLISECONDS);
			process(m);
		}
		void process(ToSend m) {
			ByteBuffer requestBuffer = buildMsg(m.state.ordinal(), m.leader, m.zxid, m.electionEpoch, m.peerEpoch, m.configData);
			manager.toSend(m.sid, requestBuffer);
		}
		public void toSend(Long sid, ByteBuffer b) {
		// 发送给自己的选票,直接放到接收消息的队列里
        if (this.mySid == sid) {
            b.position(0);
            addToRecvQueue(new Message(b.duplicate(), sid));
        } else {
        	// 将发送给其他节点的消息放到对应的阻塞队列中
            BlockingQueue<ByteBuffer> bq = queueSendMap.computeIfAbsent(sid, serverId -> new CircularBlockingQueue<>(SEND_CAPACITY));
            addToSendQueue(bq, b);
            // 如果不存在连接,新建连接
            connectOne(sid);
        }
FastLeaderElection.WorkerReceiver

继承自ZooKeeperThread,接收选举消息线程

		while (!stop) {
			
		}

QuorumCnxManager

QuorumCnxManager.connectAll,与其他节点建立连接

		// 将节点的sid和对应的发送信息的线程绑定
		final ConcurrentHashMap<Long, SendWorker> senderWorkerMap;
		
		public void connectAll(){
			for (Enumeration<Long> en = queueSendMap.keys(); en.hasMoreElements(); ) {
				sid = en.nextElement();
				connectOne(sid);
			}
		}
		// 与其他节点建立通信
		synchronized void connectOne(long sid) {
			// 与sid存在连接时,不再重复建立连接
			if (senderWorkerMap.get(sid) != null) {
				......
				return;
			}
			synchronized (self.QV_LOCK) {
				......
				if (connectOne(sid, lastProposedView.get(sid).electionAddr)) {
                    return;
                }
				......
			}
		}
		synchronized boolean connectOne(long sid, MultipleAddresses electionAddr) {
			......
			return initiateConnectionAsync(electionAddr, sid);
		}
		// 通过QuorumConnectionReqThread线程同其他节点异步的建立连接
		public boolean initiateConnectionAsync(final MultipleAddresses electionAddr, final Long sid){
			connectionExecutor.execute(new QuorumConnectionReqThread(electionAddr, sid));
		}
		
QuorumCnxManager.Listener

QuorumCnxManager.Listener继承自ZooKeeperThread,监听本节点的选举地址(端口),以BIO方式接受集群内其他节点发起的连接,并处理选举连接请求

		Set<InetSocketAddress> addresses;
		if (self.getQuorumListenOnAllIPs()) {
		    addresses = self.getElectionAddress().getWildcardAddresses();
		} else {
		    addresses = self.getElectionAddress().getAllAddresses();
		}
		// 对addresses中的每一个都生成一个ListenerHandler,ListenerHandler实现自Runnable
		// ListenerHandler因为异常达到最大的重试次数后,会调用自己的close方法和latch.countDown方法
		CountDownLatch latch = new CountDownLatch(addresses.size());
		listenerHandlers = addresses.stream().map(address ->
		                new ListenerHandler(address, self.shouldUsePortUnification(), self.isSslQuorum(), latch))
		        .collect(Collectors.toList());
		// 新建线程数为addresses.size()(即监听地址个数)的固定线程池,并将ListenerHandler集合放入线程池中执行
		final ExecutorService executor = Executors.newFixedThreadPool(addresses.size());
		try {
		    listenerHandlers.forEach(executor::submit);
		} finally {
		    // 线程池会等待已经执行的线程执行完成才关闭
		    executor.shutdown();
		}
		......
		// 等待所有的ListenerHandler都结束
	    latch.await();
		......

ListenerHandler.run

		public void run() {
			try {
			    acceptConnections();
			 ......
			 } finally {
			    latch.countDown();
			}
		}

		private void acceptConnections() {
		    int numRetries = 0;
		    while ((!shutdown) && (portBindMaxRetry == 0 || numRetries < portBindMaxRetry)) {
		        try {
		        	// 创建监听选举地址的ServerSocket,用于接受其他节点的连接
		            serverSocket = createNewServerSocket();
		            while (!shutdown) {
		            	......
	                    client = serverSocket.accept();
	                    ......
                    	// 处理连接请求
                        receiveConnection(client);
	                    numRetries = 0;
		                ......
		            }
		        } catch (IOException e) {
		            ......
		            numRetries++;
		            ......
		        }
		    }
		}
		public void receiveConnection(final Socket sock){
			din = new DataInputStream(new BufferedInputStream(sock.getInputStream()));
			handleConnection(sock, din);
		}
		private void handleConnection(Socket sock, DataInputStream din){
			// 请求连接的节点ID小于当前节点ID,如果存在连接,关闭连接;新建从当前节点到请求节点的连接
			if (sid < self.getId()){
				if (electionAddr != null) {
	                connectOne(sid, electionAddr);
	            } else {
	            	// 与sid建立连接,并新起发送和接收消息的线程
	                connectOne(sid);
	            }
			}else if (sid == self.getId()) {
				// 不存在这种情况,出现了说明有问题
	        } else { // Otherwise start worker threads to receive data.
	        	// 建立与请求节点关联的发送和接收消息的线程
	            SendWorker sw = new SendWorker(sock, sid);
	            RecvWorker rw = new RecvWorker(sock, din, sid, sw);
	            sw.setRecv(rw);
	            SendWorker vsw = senderWorkerMap.get(sid);
	            if (vsw != null) {
	                vsw.finish();
	            }
	            senderWorkerMap.put(sid, sw);
	            queueSendMap.putIfAbsent(sid, new CircularBlockingQueue<>(SEND_CAPACITY));
	            sw.start();
	            rw.start();
	        }
		}
		
QuorumCnxManager.QuorumConnectionReqThread,继承自ZooKeeperThread
		public void run() {
			initiateConnection(electionAddr, sid);
		}
		//
		public void initiateConnection(final MultipleAddresses electionAddr, final Long sid) {
			......
			//与目标节点的选举地址建立socket连接
			sock.connect(electionAddr.getReachableOrOne(), cnxTO);
			......
			// 通过sock建立接受和发送消息的线程
			startConnection(sock, sid);
			......
		}
		private boolean startConnection(Socket sock, Long sid){
			......
			// 只允许节点id大的向节点id小的建立连接
			if (sid > self.getId()) {
			    LOG.info("Have smaller server identifier, so dropping the connection: (myId:{} --> sid:{})", self.getId(), sid);
			    closeSocket(sock);
			    // Otherwise proceed with the connection
			} else {
				// 发送消息
			    SendWorker sw = new SendWorker(sock, sid);
			    // 接收消息
			    RecvWorker rw = new RecvWorker(sock, din, sid, sw);
			    sw.setRecv(rw);
			    senderWorkerMap.put(sid, sw);
			    // 发送的线程会绑定阻塞队列
			    queueSendMap.putIfAbsent(sid, new CircularBlockingQueue<>(SEND_CAPACITY));
			    sw.start();
			    rw.start();
			}
		}
QuorumCnxManager.SendWorker

给其他节点发送消息

		while (running && !shutdown && sock != null) {
			......
		    BlockingQueue<ByteBuffer> bq = queueSendMap.get(sid);
		    ......
		    // 从sid对应的阻塞队列中获取消息
	        b = pollSendQueue(bq, 1000, TimeUnit.MILLISECONDS);
		    ......
	        lastMessageSent.put(sid, b);
	        send(b);
	        ......
		}
QuorumCnxManager.RecvWorker

接收其他节点发送的消息

		while (running && !shutdown && sock != null) {
			......
		    int length = din.readInt();
		    ......
		    // 按读到的消息长度分配缓冲区,并从Socket输入流中读取完整的消息体
	        final byte[] msgArray = new byte[length];
	        din.readFully(msgArray, 0, length);
	        // 将从Socket中得到的消息封装为Message,放入QuorumCnxManager的接收队列
	        addToRecvQueue(new Message(ByteBuffer.wrap(msgArray), sid));
	        ......
		}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值