- followLeader流程
Leader选举出来之后,FOLLOWING状态的zookeeper服务会创建一个Follower实例,然后调用followLeader()方法进行followLeader流程,接下来看一下followLeader()方法:
/**
 * Main routine of a follower: records how long the election took, registers
 * the follower JMX bean, then discovers the leader, connects to it, registers
 * with it, synchronizes, and finally loops reading and processing packets
 * for as long as {@code isRunning()} returns true.
 *
 * Any exception raised inside the inner try (including checked ones) is
 * logged at WARN level, after which the socket is closed and
 * {@code pendingRevalidations} is cleared. The outer finally always stops the
 * ObserverMaster (if one was created) and unregisters the JMX bean.
 *
 * @throws InterruptedException declared by the signature
 */
void followLeader() throws InterruptedException {
// Record the election duration, publish it as a metric, and log it.
self.end_fle = Time.currentElapsedTime();
long electionTimeTaken = self.end_fle - self.start_fle;
self.setElectionTimeTaken(electionTimeTaken);
ServerMetrics.getMetrics().ELECTION_TIME.add(electionTimeTaken);
LOG.info("FOLLOWING - LEADER ELECTION TOOK - {} {}", electionTimeTaken,
QuorumPeer.FLE_TIME_UNIT);
// Reset both election timestamps now that the duration has been reported.
self.start_fle = 0;
self.end_fle = 0;
fzk.registerJMX(new FollowerBean(this, zk), self.jmxLocalPeerBean);
try {
self.setZabState(QuorumPeer.ZabState.DISCOVERY);
// NOTE: findLeader() returns null (after logging a warning) when no server
// matches the current vote; the dereference below then raises a
// NullPointerException inside the inner try, which catch (Exception) absorbs.
QuorumServer leaderServer = findLeader();
try {
connectToLeader(leaderServer.addr, leaderServer.hostname);
long newEpochZxid = registerWithLeader(Leader.FOLLOWERINFO);
// Abort following if a reconfig state change was flagged in the meantime.
if (self.isReconfigStateChange())
throw new Exception("learned about role change");
// Check whether the leader's epoch is lower than our accepted epoch.
// This should never happen but is just a safety check.
long newEpoch = ZxidUtils.getEpochFromZxid(newEpochZxid);
if (newEpoch < self.getAcceptedEpoch()) {
LOG.error("Proposed leader epoch " + ZxidUtils.zxidToString(newEpochZxid)
+ " is less than our accepted epoch " + ZxidUtils.zxidToString(self.getAcceptedEpoch()));
throw new IOException("Error: Epoch of leader is lower");
}
long startTime = Time.currentElapsedTime();
try {
self.setLeaderAddressAndId(leaderServer.addr, leaderServer.getId());
self.setZabState(QuorumPeer.ZabState.SYNCHRONIZATION);
syncWithLeader(newEpochZxid);
self.setZabState(QuorumPeer.ZabState.BROADCAST);
} finally {
// The sync duration is recorded in a finally block, so it is published
// even when syncWithLeader throws.
long syncTime = Time.currentElapsedTime() - startTime;
ServerMetrics.getMetrics().FOLLOWER_SYNC_TIME.add(syncTime);
}
// Start an ObserverMaster only when a positive ObserverMaster port is configured.
if (self.getObserverMasterPort() > 0) {
LOG.info("Starting ObserverMaster");
om = new ObserverMaster(self, fzk, self.getObserverMasterPort());
om.start();
} else {
om = null;
}
// create a reusable packet to reduce gc impact
QuorumPacket qp = new QuorumPacket();
// Packet loop: read one packet, process it, repeat while running.
while (this.isRunning()) {
readPacket(qp);
processPacket(qp);
}
} catch (Exception e) {
LOG.warn("Exception when following the leader", e);
closeSocket();
// clear pending revalidations
pendingRevalidations.clear();
}
} finally {
// Always tear down the ObserverMaster (if any) and the JMX registration.
if (om != null) {
om.stop();
}
zk.unregisterJMX((Learner)this);
}
}
分析如下:
- 1.统计Leader选举耗时并重置计时字段
- 2.给Follower注册JMX服务
- 3.调用findLeader方法查找作为Leader的zookeeper服务的相关端口信息
- 4.调用connectToLeader方法跟Leader建立连接,连接地址为findLeader返回的QuorumServer的addr字段(即集群内部通信地址,而不是选举地址)
- 5.调用registerWithLeader方法注册Learner,获取新的集群纪元
- 6.接下来进行Follower与LearnerMaster之间的数据同步并进行数据同步耗时统计,数据同步具体细节将在后面的章节进行讲解
- 7.依据是否设置了ObserverMaster端口来决定是否启动ObserverMaster服务
- 8.循环处理LearnerMaster发来的数据包
查找Leader服务:findLeader
/**
 * Looks up the server in the current view whose id matches the id carried by
 * the current vote.
 *
 * @return the matching server, with its socket addresses recreated, or
 *         {@code null} (after logging a warning) when no server matches
 */
protected QuorumServer findLeader() {
    final Vote current = self.getCurrentVote();
    for (QuorumServer candidate : self.getView().values()) {
        if (candidate.id != current.getId()) {
            continue;
        }
        // Ensure we have the leader's correct IP address before
        // attempting to connect.
        candidate.recreateSocketAddresses();
        return candidate;
    }
    LOG.warn("Couldn't find the leader with id = "
            + current.getId());
    return null;
}
// Excerpt from QuorumPeer.run()
try {
// Clear the reconfig flag before starting a new round.
reconfigFlagClear();
// If the shuttingDownLE flag is set, reset it and call
// startLeaderElection() before looking for a leader.
if (shuttingDownLE) {
shuttingDownLE = false;
startLeaderElection();
}
// The vote returned by lookForLeader() is stored as the current vote;
// findLeader() later reads it back through getCurrentVote().
setCurrentVote(makeLEStrategy().lookForLeader());
} catch (Exception e) {
// On any failure, log it and set the peer state to LOOKING.
LOG.warn("Unexpected exception", e);
setPeerState(ServerState.LOOKING);
}
分析如下:
- 1.在Leader选举出来之后每个参与选举的Learner会设置当前选票为选举出来的Leader
- 2.获取当前选票,遍历集群视图找到id与选票中Leader id一致的QuorumServer,并调用recreateSocketAddresses对其地址(投票地址和选举地址)重新执行DNS查找,确保连接之前拿到的是Leader正确的IP地址
- 3.结束查找并返回包含了Leader端口信息的QuorumServer
跟Leader建立连接:connectToLeader
/**
 * Opens the socket to the leader, retrying up to five times within a total
 * time budget, then authenticates and sets up the input/output archives.
 *
 * @param addr     address handed to sockConnect
 * @param hostname host name handed to authLearner.authenticate
 * @throws IOException          when the time budget or the retry count is exhausted
 * @throws InterruptedException if the sleep between retries is interrupted
 * @throws X509Exception        declared by createSocket
 */
protected void connectToLeader(InetSocketAddress addr, String hostname)
throws IOException, InterruptedException, X509Exception {
this.sock = createSocket();
// leader connection timeout defaults to tickTime * initLimit
int connectTimeout = self.tickTime * self.initLimit;
// but if connectToLearnerMasterLimit is specified, use that value to calculate
// timeout instead of using the initLimit value
if (self.connectToLearnerMasterLimit > 0) {
connectTimeout = self.tickTime * self.connectToLearnerMasterLimit;
}
int remainingTimeout;
long startNanoTime = nanoTime();
for (int tries = 0; tries < 5; tries++) {
try {
// Recalculate the remaining budget on every attempt: earlier attempts and
// the sleep between retries (leaderConnectDelayDuringRetryMs) have
// consumed part of connectTimeout. Elapsed nanoseconds are converted to ms.
remainingTimeout = connectTimeout - (int)((nanoTime() - startNanoTime) / 1000000);
if (remainingTimeout <= 0) {
LOG.error("connectToLeader exceeded on retries.");
// This IOException is thrown inside the try, so it is caught by the
// handler below, which logs again and rethrows it because the
// recomputed remaining time is <= 1000.
throw new IOException("connectToLeader exceeded on retries.");
}
sockConnect(sock, addr, Math.min(connectTimeout, remainingTimeout));
if (self.isSslQuorum()) {
((SSLSocket) sock).startHandshake();
}
sock.setTcpNoDelay(nodelay);
// Connected: leave the retry loop without sleeping.
break;
} catch (IOException e) {
remainingTimeout = connectTimeout - (int)((nanoTime() - startNanoTime) / 1000000);
// NOTE(review): the give-up threshold is a hard-coded 1000 ms while the
// actual retry delay is leaderConnectDelayDuringRetryMs; confirm whether
// the two are meant to match.
if (remainingTimeout <= 1000) {
LOG.error("Unexpected exception, connectToLeader exceeded. tries=" + tries +
", remaining init limit=" + remainingTimeout +
", connecting to " + addr,e);
throw e;
} else if (tries >= 4) {
// Last permitted attempt failed.
LOG.error("Unexpected exception, retries exceeded. tries=" + tries +
", remaining init limit=" + remainingTimeout +
", connecting to " + addr,e);
throw e;
} else {
LOG.warn("Unexpected exception, tries=" + tries +
", remaining init limit=" + remainingTimeout +
", connecting to " + addr,e);
// Replace the socket with a fresh one for the next attempt.
this.sock = createSocket();
}
}
// Reached only after a failed attempt that will be retried.
Thread.sleep(leaderConnectDelayDuringRetryMs);
}
self.authLearner.authenticate(sock, hostname);
// Wrap the socket streams in the binary archives used by readPacket/writePacket.
leaderIs = BinaryInputArchive.getArchive(new BufferedInputStream(
sock.getInputStream()));
bufferedOutput = new BufferedOutputStream(sock.getOutputStream());
leaderOs = BinaryOutputArchive.getArchive(bufferedOutput);
}
/**
 * Creates an unconnected socket: an SSL socket when SSL quorum is enabled,
 * a plain socket otherwise. The socket's SO_TIMEOUT is set to
 * {@code tickTime * initLimit}.
 *
 * @return the new, not yet connected socket
 * @throws X509Exception declared for the SSL socket creation path
 * @throws IOException   if the timeout cannot be applied
 */
private Socket createSocket() throws X509Exception, IOException {
    final Socket created = self.isSslQuorum()
            ? self.getX509Util().createSSLSocket()
            : new Socket();
    created.setSoTimeout(self.tickTime * self.initLimit);
    return created;
}
/**
 * Connects {@code sock} to {@code addr}, waiting at most {@code timeout}
 * for the connection to be established. Kept as a separate protected
 * method, so subclasses can override the connect step.
 *
 * @param sock    socket to connect
 * @param addr    remote address
 * @param timeout connect timeout passed straight to {@link Socket#connect}
 * @throws IOException if the connection attempt fails
 */
protected void sockConnect(
        Socket sock,
        InetSocketAddress addr,
        int timeout) throws IOException {
    sock.connect(addr, timeout);
}
分析如下:
- 1.根据是否设置了SSL认证创建相应的Socket实例并设置超时时间为initLimit * tickTime
- 2.设置连接超时时间以及连接开始时间(连接超时时间可以通过connectToLearnerMasterLimit参数进行设置,默认为initLimit * tickTime)
- 3.尝试跟Leader建立连接,连接失败时,有以下几种情况:
- 当剩余超时时间小于等于1秒时将直接抛出异常中断此次连接
- 当重试次数达到上限时直接抛出异常中断此次连接
- 重新创建Socket实例,然后线程陷入睡眠状态,睡眠时间默认100毫秒(可通过zookeeper.leaderConnectDelayDuringRetryMs参数进行设置),线程睡眠结束之后将进行连接重试
- 4.连接成功之后,进行auth认证
- 5.认证通过之后,获取自定义jute协议中的BinaryInputArchive和BinaryOutputArchive以用作数据读写
Learner与Leader交互的读包写包
/**
 * Writes a packet to the leader output archive and optionally flushes the
 * buffered output. Both steps run while holding the monitor of
 * {@code leaderOs}.
 *
 * @param pp    packet to write; when {@code null} nothing is written
 * @param flush whether to flush {@code bufferedOutput} afterwards
 * @throws IOException if writing or flushing fails
 */
void writePacket(QuorumPacket pp, boolean flush) throws IOException {
    synchronized (leaderOs) {
        final boolean hasPacket = pp != null;
        if (hasPacket) {
            leaderOs.writeRecord(pp, "packet");
        }
        if (!flush) {
            return;
        }
        bufferedOutput.flush();
    }
}
/**
 * Reads the next packet from the leader input archive into {@code pp}
 * (holding the monitor of {@code leaderIs} while reading) and, when trace
 * logging is enabled, logs it with a mask chosen by packet type.
 *
 * @param pp packet instance that receives the deserialized record
 * @throws IOException if reading fails
 */
void readPacket(QuorumPacket pp) throws IOException {
    synchronized (leaderIs) {
        leaderIs.readRecord(pp, "packet");
    }
    // PING packets get their own trace mask; everything else uses the packet mask.
    final long traceMask = (pp.getType() == Leader.PING)
            ? ZooTrace.SERVER_PING_TRACE_MASK
            : ZooTrace.SERVER_PACKET_TRACE_MASK;
    if (!LOG.isTraceEnabled()) {
        return;
    }
    ZooTrace.logQuorumPacket(LOG, traceMask, 'i', pp);
}
- Learner读包的时候会锁定leaderIs这个输入流,只有获取到锁之后才能进行读取,这样做可以避免数据包读取混乱的情况;读取完成之后,如果开启了trace级别的日志,则会按数据包类型(PING或其他类型)选择对应的traceMask并记录一条trace日志
- Learner写包的时候也会锁定leaderOs这个输出流,只有获取到锁之后才能进行写操作,这样做可以避免缓冲区写入混乱的情况,然后再根据参数flush来决定是否刷新缓冲区将数据包发送出去
向Leader注册Learner信息:registerWithLeader
/**
 * Registers this learner with the leader and negotiates the epoch.
 *
 * Sends a packet of type {@code pktType} whose zxid is built from the
 * accepted epoch and whose payload is the serialized {@code LearnerInfo},
 * then reads the leader's reply. A LEADERINFO reply is answered with an
 * ACKEPOCH packet; any other reply must be NEWLEADER.
 *
 * @param pktType packet type to send
 * @return {@code makeZxid(newEpoch, 0)} after a LEADERINFO reply, otherwise
 *         the zxid of the NEWLEADER packet
 * @throws IOException if the leader's epoch is below the accepted epoch, if
 *                     the first packet is neither LEADERINFO nor NEWLEADER,
 *                     or on I/O failure
 */
protected long registerWithLeader(int pktType) throws IOException {
    // Captured before talking to the leader; sent back in ACKEPOCH.
    final long lastLoggedZxid = self.getLastLoggedZxid();

    // Build the info packet: zxid carries the accepted epoch.
    final QuorumPacket packet = new QuorumPacket();
    packet.setType(pktType);
    packet.setZxid(ZxidUtils.makeZxid(self.getAcceptedEpoch(), 0));

    // Payload: sid, protocol version 0x10000 and the quorum verifier version.
    final LearnerInfo info = new LearnerInfo(self.getId(), 0x10000, self.getQuorumVerifier().getVersion());
    final ByteArrayOutputStream payload = new ByteArrayOutputStream();
    BinaryOutputArchive.getArchive(payload).writeRecord(info, "LearnerInfo");
    packet.setData(payload.toByteArray());

    writePacket(packet, true);
    // The same packet instance is reused to receive the leader's reply.
    readPacket(packet);
    final long newEpoch = ZxidUtils.getEpochFromZxid(packet.getZxid());

    if (packet.getType() != Leader.LEADERINFO) {
        // No LEADERINFO handshake: adopt a newer epoch and require NEWLEADER.
        if (newEpoch > self.getAcceptedEpoch()) {
            self.setAcceptedEpoch(newEpoch);
        }
        if (packet.getType() != Leader.NEWLEADER) {
            LOG.error("First packet should have been NEWLEADER");
            throw new IOException("First packet should have been NEWLEADER");
        }
        return packet.getZxid();
    }

    // LEADERINFO: the payload starts with the leader's protocol version.
    leaderProtocolVersion = ByteBuffer.wrap(packet.getData()).getInt();
    final byte[] ackPayload = new byte[4];
    final ByteBuffer ackBuffer = ByteBuffer.wrap(ackPayload);
    if (newEpoch > self.getAcceptedEpoch()) {
        ackBuffer.putInt((int) self.getCurrentEpoch());
        self.setAcceptedEpoch(newEpoch);
    } else if (newEpoch == self.getAcceptedEpoch()) {
        // We have already acked an epoch equal to the leader's, so we cannot ack
        // again, but the leader still needs our last zxid in order to sync with
        // us. The -1 marks this reply as not counting as an ack for the new epoch.
        ackBuffer.putInt(-1);
    } else {
        throw new IOException("Leaders epoch, " + newEpoch + " is less than accepted epoch, " + self.getAcceptedEpoch());
    }
    writePacket(new QuorumPacket(Leader.ACKEPOCH, lastLoggedZxid, ackPayload, null), true);
    return ZxidUtils.makeZxid(newEpoch, 0);
}
分析如下:
- 1.创建一个FOLLOWERINFO数据包,Follower上次在集群运行中接受的Leader的newEpoch纪元信息包装为zxid,Learner的信息作为数据包的data字段,Learner信息包含了serverId、协议版本(0x10000)、投票验证器的版本号,然后将这个FOLLOWERINFO数据包发送给Leader
- 2.阻塞读取Leader发来的数据包,读取到之后从数据包的zxid获取Leader的newEpoch,然后数据包分两种情况(可以跟上一章节印证):
- 名词解释:此代码段中acceptedEpoch代表上次集群运行中接收到来自Leader的newEpoch,currentEpoch亦是上次集群运行纪元
- LEADERINFO:Leader发来的数据包是LEADERINFO,接下来将获取Leader的传输协议版本,然后对比Leader发来的newEpoch跟当前Follower服务的acceptedEpoch,如果newEpoch大于acceptedEpoch则重新设置acceptedEpoch为newEpoch并且设置接下来要发送的ACKEPOCH数据包的data字段为currentEpoch;如果newEpoch等于acceptedEpoch,那么设置接下来要发送的ACKEPOCH数据包的data字段为-1,表示不作为此次newEpoch的选票ack;如果newEpoch小于acceptedEpoch,那么直接抛出异常;接下来构建一个ACKEPOCH数据包并发送给Leader
- Leader发来的数据包不是LEADERINFO,那么判断newEpoch是否大于当前Follower服务的acceptedEpoch,如果大于则重新设置acceptedEpoch为newEpoch,并判断数据包类型是不是NEWLEADER,不是将抛出异常,然后会重新进入LOOKING状态寻找Leader(LearnerHandler在处理Learner信息时,如果版本号小于0x10000时将跳过LEADERINFO以及ACKEPOCH)
注意:如果newEpoch比当前Follower服务的acceptedEpoch更新,说明当前Follower对应的LearnerHandler调用getEpochToPropose时处于超过半数的Follower中;如果相等,则表示当前Follower对应的LearnerHandler调用getEpochToPropose时未参与到newEpoch投票(也就是投票已经结束,集群newEpoch已经确定);小于的话则会重新进入LOOKING状态寻找Leader
循环处理数据包
/**
 * Dispatches one packet received from the leader according to its type.
 *
 * <ul>
 *   <li>PING: delegates to {@code ping(qp)}.</li>
 *   <li>PROPOSAL: deserializes the transaction, warns on a zxid gap, applies a
 *       reconfig's new quorum verifier as "last seen", logs the request and
 *       forwards the proposal to the ObserverMaster if one exists.</li>
 *   <li>COMMIT: commits the zxid and notifies the ObserverMaster if present.</li>
 *   <li>COMMITANDACTIVATE: processes the reconfig at the head of
 *       {@code fzk.pendingTxns}, commits it, and throws if
 *       {@code processReconfig} reports a major change.</li>
 *   <li>UPTODATE: only logs an error.</li>
 *   <li>REVALIDATE: handled by the ObserverMaster when it accepts the packet,
 *       otherwise by {@code revalidate(qp)}.</li>
 *   <li>SYNC: calls {@code fzk.sync()}.</li>
 * </ul>
 *
 * Reconfig payloads are decoded explicitly as UTF-8 rather than with the
 * platform default charset, so the result does not depend on the JVM's
 * {@code file.encoding}.
 *
 * @param qp the packet to process
 * @throws Exception on deserialization/processing failure, or with the
 *                   message "changes proposed in reconfig" after a major
 *                   configuration change
 */
protected void processPacket(QuorumPacket qp) throws Exception {
    switch (qp.getType()) {
    case Leader.PING:
        ping(qp);
        break;
    case Leader.PROPOSAL: {
        ServerMetrics.getMetrics().LEARNER_PROPOSAL_RECEIVED_COUNT.add(1);
        TxnHeader hdr = new TxnHeader();
        Record txn = SerializeUtils.deserializeTxn(qp.getData(), hdr);
        // Proposals are expected with consecutive zxids; a gap is only logged.
        if (hdr.getZxid() != lastQueued + 1) {
            LOG.warn("Got zxid 0x"
                    + Long.toHexString(hdr.getZxid())
                    + " expected 0x"
                    + Long.toHexString(lastQueued + 1));
        }
        lastQueued = hdr.getZxid();
        if (hdr.getType() == OpCode.reconfig) {
            SetDataTxn setDataTxn = (SetDataTxn) txn;
            QuorumVerifier qv = self.configFromString(
                    new String(setDataTxn.getData(), java.nio.charset.StandardCharsets.UTF_8));
            self.setLastSeenQuorumVerifier(qv, true);
        }
        fzk.logRequest(hdr, txn);
        /*
         * Request header is created only by the leader, so this is only set
         * for quorum packets. If there is a clock drift, the latency may be
         * negative. Headers use wall time, not CLOCK_MONOTONIC.
         *
         * hdr is always non-null here (it was created above and already
         * dereferenced), so no null check is needed.
         */
        long now = Time.currentWallTime();
        long latency = now - hdr.getTime();
        if (latency > 0) {
            ServerMetrics.getMetrics().PROPOSAL_LATENCY.add(latency);
        }
        if (om != null) {
            final long startTime = Time.currentElapsedTime();
            om.proposalReceived(qp);
            ServerMetrics.getMetrics().OM_PROPOSAL_PROCESS_TIME.add(Time.currentElapsedTime() - startTime);
        }
        break;
    }
    case Leader.COMMIT:
        ServerMetrics.getMetrics().LEARNER_COMMIT_RECEIVED_COUNT.add(1);
        fzk.commit(qp.getZxid());
        if (om != null) {
            final long startTime = Time.currentElapsedTime();
            om.proposalCommitted(qp.getZxid());
            ServerMetrics.getMetrics().OM_COMMIT_PROCESS_TIME.add(Time.currentElapsedTime() - startTime);
        }
        break;
    case Leader.COMMITANDACTIVATE: {
        // get the new configuration from the request
        Request request = fzk.pendingTxns.element();
        SetDataTxn setDataTxn = (SetDataTxn) request.getTxn();
        QuorumVerifier qv = self.configFromString(
                new String(setDataTxn.getData(), java.nio.charset.StandardCharsets.UTF_8));
        // get new designated leader from (current) leader's message
        ByteBuffer buffer = ByteBuffer.wrap(qp.getData());
        long suggestedLeaderId = buffer.getLong();
        final long zxid = qp.getZxid();
        boolean majorChange =
                self.processReconfig(qv, suggestedLeaderId, zxid, true);
        // commit (writes the new config to ZK tree (/zookeeper/config)
        fzk.commit(zxid);
        if (om != null) {
            om.informAndActivate(zxid, suggestedLeaderId);
        }
        // Thrown only after the commit above, so the new config is already applied.
        if (majorChange) {
            throw new Exception("changes proposed in reconfig");
        }
        break;
    }
    case Leader.UPTODATE:
        LOG.error("Received an UPTODATE message after Follower started");
        break;
    case Leader.REVALIDATE:
        // Fall back to local revalidation when there is no ObserverMaster or
        // it does not handle this session.
        if (om == null || !om.revalidateLearnerSession(qp)) {
            revalidate(qp);
        }
        break;
    case Leader.SYNC:
        fzk.sync();
        break;
    default:
        LOG.warn("Unknown packet type: {}", LearnerHandler.packetToString(qp));
        break;
    }
}
这里略做简要分析(有些内容需要放到以后的章节讲):
- PING:LearnerMaster发来的心跳数据包,Learner会把本地的会话信息写入数据包的data字段后回复给LearnerMaster,然后将会由LearnerMaster延长会话过期时间
- PROPOSAL:Leader发来的事务提案,然后进行事务处理(具体处理细节后面章节会专门讲解)
- COMMIT:Leader发来的提交信息,表示提案可以提交了
- COMMITANDACTIVATE:处理reconfig事务请求,将新配置提交
- UPTODATE:Follower已经启动之后不应该再收到该数据包,因此这里仅记录一条error日志,不做其他处理
- REVALIDATE:客户端连接服务端时重新验证并激活会话
- SYNC:Leader、Follower进行数据同步的指令
PING
/**
 * Answers a ping: fills the packet's data with the touch snapshot, written
 * as consecutive (long key, int value) pairs, and sends the same packet
 * back with a flush.
 *
 * @param qp the received ping packet, reused for the reply
 * @throws IOException if serializing or writing the reply fails
 */
protected void ping(QuorumPacket qp) throws IOException {
    final ByteArrayOutputStream sessionBytes = new ByteArrayOutputStream();
    final DataOutputStream out = new DataOutputStream(sessionBytes);
    for (Entry<Long, Integer> touched : zk.getTouchSnapshot().entrySet()) {
        out.writeLong(touched.getKey());
        out.writeInt(touched.getValue());
    }
    qp.setData(sessionBytes.toByteArray());
    writePacket(qp, true);
}
获取到会话信息然后将其发送给Leader
REVALIDATE
/**
 * Handles a revalidation reply: reads the session id and validity flag from
 * the packet data, removes the matching pending connection and, if one was
 * found, finishes its session initialization with that flag.
 *
 * @param qp packet whose data holds a long session id followed by a boolean
 * @throws IOException if the packet data cannot be read
 */
protected void revalidate(QuorumPacket qp) throws IOException {
    final DataInputStream in = new DataInputStream(new ByteArrayInputStream(qp.getData()));
    final long sessionId = in.readLong();
    final boolean valid = in.readBoolean();

    final ServerCnxn cnxn = pendingRevalidations.remove(sessionId);
    if (cnxn != null) {
        zk.finishSessionInit(cnxn, valid);
    } else {
        // No connection is waiting on this session id.
        LOG.warn("Missing session 0x"
                + Long.toHexString(sessionId)
                + " for validation");
    }

    if (!LOG.isTraceEnabled()) {
        return;
    }
    ZooTrace.logTraceMessage(LOG,
            ZooTrace.SESSION_TRACE_MASK,
            "Session 0x" + Long.toHexString(sessionId)
                    + " is valid: " + valid);
}
读取sessionId以及是否激活成功的标志valid,然后获取到这个sessionId代表的客户端对应的服务端长连接处理器ServerCnxn,接下来会进行会话激活