前言
本文主要记录Zookeeper的启动过程,以及启动后Leader和Follower对请求的简要处理流程。
文中所展示的源码为Zookeeper首个tag版本0.0.1。由于该版本为Zookeeper初期版本,代码结构简单清晰,读起来简单明了。整体框架结构后续变化较小,故后文代码均以此版本为例。(源码可通过Github下载,切换到早期分支即可)
源码过长的部分进行了省略,主体逻辑上配合注释帮助理解。
启动过程(源码分析)
QuorumPeer作为启动Zookeeper集群服务的入口类。
com.yahoo.zookeeper.server.quorum.QuorumPeer#main
/**
 * Entry point that starts a ZooKeeper server from a configuration file.
 *
 * When more than one server is configured a QuorumPeer thread is created,
 * started and joined (quorum mode); otherwise a single ZooKeeperServer is
 * started behind an NIOServerCnxn.Factory (standalone mode). The method
 * always ends by calling System.exit(2), including after an exception has
 * been logged.
 *
 * NOTE(review): this is an excerpt. Argument validation and the code that
 * derives servers, dataDir, dataLogDir, clientPort, electionAlg, id,
 * tickTime, initLimit and syncLimit from the configuration were omitted by
 * the article's author; as excerpted, servers is never populated.
 *
 * @param args command-line arguments; args[0] is the path of the
 *             configuration file
 */
public static void main(String args[]) {
    try {
        // ...
        // (validation of args omitted from this excerpt)
        // Load the configuration file.
        File zooCfgFile = new File(args[0]);
        Properties cfg = new Properties();
        // Properties.load leaves the stream open, so close it explicitly
        // instead of leaking the file handle for the life of the server.
        FileInputStream cfgIn = new FileInputStream(zooCfgFile);
        try {
            cfg.load(cfgIn);
        } finally {
            cfgIn.close();
        }
        ArrayList<QuorumServer> servers = new ArrayList<QuorumServer>();
        // The number of configured servers selects quorum or standalone mode.
        if (servers.size() > 1) {
            // (some code omitted from this excerpt)
            // Quorum mode: build the peer, start its thread and wait for it.
            QuorumPeer self = new QuorumPeer(servers, dataDir, dataLogDir,
                    clientPort, electionAlg, id, tickTime, initLimit,
                    syncLimit);
            self.start();
            self.join();
        } else {
            int port = clientPort;
            ZooKeeperServer zk = new ZooKeeperServer(dataDir, dataLogDir, tickTime);
            // Standalone mode: start the server, attach it to the connection
            // factory, and shut it down once the factory thread has ended.
            zk.startup();
            NIOServerCnxn.Factory t = new NIOServerCnxn.Factory(port);
            t.setZooKeeperServer(zk);
            t.join();
            zk.shutdown();
        }
    } catch (Exception e) {
        ZooLog.logException(e);
    }
    System.exit(2);
}
集群模式下,初始化过程是新建QuorumPeer对象,并调用start()方法启动线程。
com.yahoo.zookeeper.server.quorum.QuorumPeer#QuorumPeer
/**
 * Builds a quorum peer: stores the configuration, creates the client
 * connection factory, casts an initial vote for itself and, for election
 * algorithm 0, binds the UDP election socket and starts the responder thread.
 *
 * Statement order matters here: the connection factory receives this peer
 * before the remaining fields are assigned, and the initial vote is built
 * only after dataDir/dataLogDir have been stored.
 *
 * @param quorumPeers all servers of the ensemble; must contain an entry
 *                    whose id equals myid
 * @param dataDir     stored in this.dataDir
 * @param dataLogDir  stored in this.dataLogDir
 * @param clientPort  port handed to the NIOServerCnxn.Factory
 * @param electionAlg election algorithm selector; only the value 0 triggers
 *                    the UDP socket / responder setup in this constructor
 * @param myid        id of this server
 * @param tickTime    stored in this.tickTime
 * @param initLimit   stored in this.initLimit
 * @param syncLimit   stored in this.syncLimit
 * @throws IOException a SocketException is thrown when myid is not found in
 *                     quorumPeers; other I/O failures may come from the
 *                     factory or the DatagramSocket
 */
public QuorumPeer(ArrayList<QuorumServer> quorumPeers, File dataDir,
File dataLogDir, int clientPort, int electionAlg, long myid,
int tickTime, int initLimit, int syncLimit) throws IOException {
super("QuorumPeer");
this.clientPort = clientPort;
// Create the NIOServerCnxn.Factory for the client port. Per the original
// author's note, constructing it also brings up the NIO endpoint used to
// receive and send requests (Factory source is not shown in this excerpt).
this.cnxnFactory = new NIOServerCnxn.Factory(clientPort, this);
this.quorumPeers = quorumPeers;
this.dataDir = dataDir;
this.electionAlg = electionAlg;
this.dataLogDir = dataLogDir;
this.myid = myid;
this.tickTime = tickTime;
this.initLimit = initLimit;
this.syncLimit = syncLimit;
// Initial vote: this server votes for itself with its last logged zxid.
currentVote = new Vote(myid, getLastLoggedZxid());
// Find this server's own quorum address in the peer list.
for (QuorumServer p : quorumPeers) {
if (p.id == myid) {
myQuorumAddr = p.addr;
break;
}
}
if (myQuorumAddr == null) {
throw new SocketException("My id " + myid + " not in the peer list");
}
if (electionAlg == 0) {
udpSocket = new DatagramSocket(myQuorumAddr.getPort());
// Start the responder thread. The original note says it answers requests
// from the leader; presumably it replies on udpSocket to the UDP vote
// queries sent by lookForLeader -- TODO confirm against ResponderThread.
new ResponderThread().start();
}
}
构造方法中有两步初始化了网络交互的代码。
this.cnxnFactory = new NIOServerCnxn.Factory(clientPort, this);
开启当前zks的NIO网络服务,用于接收处理client对于server的读写请求。
new ResponderThread().start();
该线程的主要作用是响应选举过程中其他节点通过UDP发来的查询请求,返回xid、当前server id、当前所投leader的id以及对应的zxid。
初始化QuorumPeer后调用start()方法,启动QuorumPeer线程(QuorumPeer继承了Thread),进入到QuorumPeer#run方法下执行。
/**
 * Main loop of the peer thread. While the peer is running it dispatches on
 * the current server state:
 * LOOKING runs the leader election, FOLLOWING runs a Follower until it
 * returns or fails, LEADING runs a Leader until it returns or fails.
 * After following or leading ends, the state is reset to LOOKING so that a
 * new election takes place on the next iteration.
 */
public void run() {
    while (running) {
        // The state is re-read on every iteration and decides what to do.
        switch (state) {
        case LOOKING:
            // A freshly started server is LOOKING: take part in the election
            // to find out whether this node leads or follows.
            try {
                ZooLog.logWarn("LOOKING");
                switch (electionAlg) {
                case 0:
                    // The election itself happens in lookForLeader.
                    currentVote = new LeaderElection(this).lookForLeader();
                    break;
                }
            } catch (Exception e) {
                ZooLog.logException(e);
                state = ServerState.LOOKING;
            }
            break;
        case FOLLOWING:
            // Follower role: followLeader blocks in its own loop until
            // following ends, after which a new election is required.
            try {
                ZooLog.logWarn("FOLLOWING");
                follower = new Follower(this);
                follower.followLeader();
            } catch (Exception e) {
                ZooLog.logException(e);
            } finally {
                // If the Follower constructor threw, follower was never
                // assigned. Guard the call so a NullPointerException cannot
                // escape this finally block and terminate the peer thread
                // (the LEADING branch below uses the same guard).
                if (follower != null) {
                    follower.shutdown();
                }
                follower = null;
                state = ServerState.LOOKING;
            }
            break;
        case LEADING:
            // Leader role: lead blocks in its own loop until leadership
            // ends, after which a new election is required.
            ZooLog.logWarn("LEADING");
            try {
                leader = new Leader(this);
                leader.lead();
                // Normal return: clear the field so the finally block does
                // not force a shutdown.
                leader = null;
            } catch (Exception e) {
                ZooLog.logException(e);
            } finally {
                if (leader != null) {
                    leader.shutdown("Forcing shutdown");
                }
                state = ServerState.LOOKING;
            }
            break;
        }
    }
    ZooLog.logError("QuorumPeer main thread exited");
}
QuorumPeer启动后,首先会进入到lookForLeader来确认集群中哪个节点成为leader,执行后更改当前zks的state为FOLLOWING或是LEADING
LeaderElection#lookForLeader
/**
 * Runs the UDP based leader election until a candidate is backed by more
 * than half of the configured peers, or until the peer stops running.
 *
 * Each round sends a 4-byte request (a random xid chosen once per call) to
 * every configured peer and expects a 28-byte reply: xid (int), the
 * responder's id (long), and the responder's vote as id (long) and zxid
 * (long). The collected votes are tallied by countVotes. On success
 * self.currentVote and self.state (LEADING or FOLLOWING) are updated.
 *
 * The datagram socket is closed on every exit path, including the
 * "no longer running" return and an InterruptedException from sleep.
 *
 * @return the winning vote, or null when self.running became false before
 *         a winner was found
 * @throws InterruptedException if the thread is interrupted while sleeping
 */
public Vote lookForLeader() throws InterruptedException {
    // Start by voting for ourself. Per the original note, getLastLoggedZxid
    // derives the zxid from the server's log and snapshot files.
    self.currentVote = new Vote(self.myid, self.getLastLoggedZxid());
    // We are going to look for a leader by casting a vote for ourself
    byte requestBytes[] = new byte[4];
    ByteBuffer requestBuffer = ByteBuffer.wrap(requestBytes);
    byte responseBytes[] = new byte[28];
    ByteBuffer responseBuffer = ByteBuffer.wrap(responseBytes);
    /* The current vote for the leader. Initially me! */
    DatagramSocket s = null;
    // Votes are exchanged with the other servers over UDP.
    try {
        s = new DatagramSocket();
        s.setSoTimeout(200);
    } catch (SocketException e1) {
        e1.printStackTrace();
        System.exit(4);
    }
    try {
        DatagramPacket requestPacket = new DatagramPacket(requestBytes,
                requestBytes.length);
        DatagramPacket responsePacket = new DatagramPacket(responseBytes,
                responseBytes.length);
        HashMap<InetSocketAddress, Vote> votes = new HashMap<InetSocketAddress, Vote>(
                self.quorumPeers.size());
        int xid = new Random().nextInt();
        while (self.running) {
            votes.clear();
            requestBuffer.clear();
            requestBuffer.putInt(xid);
            requestPacket.setLength(4);
            // Ask every configured peer for its current vote.
            for (QuorumServer server : self.quorumPeers) {
                requestPacket.setSocketAddress(server.addr);
                try {
                    s.send(requestPacket);
                    responsePacket.setLength(responseBytes.length);
                    s.receive(responsePacket);
                    if (responsePacket.getLength() != responseBytes.length) {
                        ZooLog.logError("Got a short response: "
                                + responsePacket.getLength());
                        continue;
                    }
                    responseBuffer.clear();
                    int recvedXid = responseBuffer.getInt();
                    if (recvedXid != xid) {
                        ZooLog.logError("Got bad xid: expected " + xid
                                + " got " + recvedXid);
                        continue;
                    }
                    long peerId = responseBuffer.getLong();
                    server.id = peerId;
                    Vote vote = new Vote(responseBuffer.getLong(),
                            responseBuffer.getLong());
                    InetSocketAddress addr = (InetSocketAddress) responsePacket
                            .getSocketAddress();
                    votes.put(addr, vote);
                } catch (IOException e) {
                    // Errors are okay, since hosts may be
                    // down
                    // ZooKeeperServer.logException(e);
                }
            }
            // Tally the collected votes.
            ElectionResult result = countVotes(votes);
            if (result.winner.id >= 0) {
                self.currentVote = result.vote;
                if (result.winningCount > (self.quorumPeers.size() / 2)) {
                    self.currentVote = result.winner;
                    // Closed here as in the original, before the follower
                    // delay; the finally block below is then a no-op.
                    s.close();
                    // The election outcome decides this server's new state.
                    self.state = (self.currentVote.id == self.myid) ? ServerState.LEADING
                            : ServerState.FOLLOWING;
                    if (self.state == ServerState.FOLLOWING) {
                        Thread.sleep(100);
                    }
                    return self.currentVote;
                }
            }
            Thread.sleep(1000);
        }
        return null;
    } finally {
        // DatagramSocket.close() is safe to call on an already closed socket.
        if (s != null) {
            s.close();
        }
    }
}
如果当前zks为FOLLOWING状态,会进入到Follower#followLeader方法内。
/**
 * The main method called by the follower to follow the leader.
 *
 * Steps, in order: look up the address of the server named by
 * self.currentVote.id, connect to it (up to 3 attempts), send our last
 * logged zxid, expect a NEWLEADER packet back, load data (from the leader's
 * stream when the zxids differ, otherwise locally), acknowledge, start the
 * follower server, and then process leader packets while self.running is
 * true. An IOException closes the socket and clears pendingRevalidations.
 * In every case self.state is set back to LOOKING on exit.
 *
 * @throws InterruptedException
 *             if the thread is interrupted while sleeping between
 *             connection attempts
 */
void followLeader() throws InterruptedException {
// Look up the leader's address; the connection attempt follows below.
InetSocketAddress addr = null;
// Find the leader by id
for (QuorumServer s : self.quorumPeers) {
if (s.id == self.currentVote.id) {
addr = s.addr;
break;
}
}
// NOTE(review): a missing leader is only logged; execution continues with
// addr == null. java.net.Socket.connect rejects a null endpoint with an
// IllegalArgumentException, which the catch (IOException) below does not
// handle -- confirm this is the intended failure path.
if (addr == null) {
ZooLog.logError("Couldn't find the leader with id = "
+ self.currentVote.id);
}
ZooLog.logWarn("Following " + addr);
sock = new Socket();
try {
QuorumPacket ack = new QuorumPacket(Leader.ACK, 0, null, null);
// Read timeout during the initial handshake: tickTime * initLimit.
sock.setSoTimeout(self.tickTime * self.initLimit);
// Try to connect up to 3 times, sleeping 1 second between attempts;
// the third ConnectException is rethrown.
for (int tries = 0; tries < 3; tries++) {
try {
sock.connect(addr, self.tickTime * self.syncLimit);
sock.setTcpNoDelay(true);
break;
} catch (ConnectException e) {
ZooLog.logException(e);
if (tries == 2) {
throw e;
}
}
Thread.sleep(1000);
}
// Wrap the connection's input and output streams in archives.
leaderIs = BinaryInputArchive.getArchive(new BufferedInputStream(
sock.getInputStream()));
bufferedOutput = new BufferedOutputStream(sock.getOutputStream());
leaderOs = BinaryOutputArchive.getArchive(bufferedOutput);
// Send this node's last logged zxid to the leader.
QuorumPacket qp = new QuorumPacket();
qp.setType(Leader.LASTZXID);
long sentLastZxid = self.getLastLoggedZxid();
qp.setZxid(sentLastZxid);
writePacket(qp);
// Read the leader's reply; it must be a NEWLEADER packet, whose zxid is
// used below.
readPacket(qp);
if (qp.getType() != Leader.NEWLEADER) {
ZooLog.logError("First packet should have been NEWLEADER");
throw new IOException("First packet should have been NEWLEADER");
}
// Create the FollowerZooKeeperServer instance for this node.
zk = new FollowerZooKeeperServer(self.getId(), self.dataDir,
self.dataLogDir, this);
synchronized (zk) {
// If the leader's zxid differs from the one we sent, load the data the
// leader sends over the connection; otherwise load the local data.
if (qp.getZxid() != sentLastZxid) {
// The leader is going to dump the database
zk.loadData(leaderIs);
// The dump must be terminated by the fixed signature string.
String signature = leaderIs.readString("signature");
if (!signature.equals("BenWasHere")) {
ZooLog.logError("Missing signature. Got " + signature);
throw new IOException("Missing signature");
}
} else {
zk.loadData();
}
zk.dataTree.lastProcessedZxid = qp.getZxid();
}
// After loading, acknowledge to the leader; the ack carries the leader's
// zxid with the low 32 bits cleared.
ack.setZxid(qp.getZxid() & ~0xffffffffL);
writePacket(ack);
// From here on the read timeout is tickTime * syncLimit.
sock.setSoTimeout(self.tickTime * self.syncLimit);
// Start the follower server instance.
zk.startup();
// Process packets from the leader for as long as the peer is running.
while (self.running) {
readPacket(qp);
// Dispatch on the type of the packet sent by the leader.
switch (qp.getType()) {
// Heartbeat between leader and follower.
case Leader.PING:
// Send back the ping with our session data
ByteArrayOutputStream bos = new ByteArrayOutputStream();
DataOutputStream dos = new DataOutputStream(bos);
HashMap<Long, Integer> touchTable = ((FollowerZooKeeperServer) zk)
.getTouchSnapshot();
for (Entry<Long, Integer> entry : touchTable.entrySet()) {
dos.writeLong(entry.getKey());
dos.writeInt(entry.getValue());
}
qp.setData(bos.toByteArray());
writePacket(qp);
break;
// A proposed transaction from the leader: deserialize it and log it.
case Leader.PROPOSAL:
TxnHeader hdr = new TxnHeader();
BinaryInputArchive ia = BinaryInputArchive
.getArchive(new ByteArrayInputStream(qp.getData()));
Record txn = ZooKeeperServer.deserializeTxn(ia, hdr);
// A gap in the zxid sequence is only warned about, not rejected.
if (hdr.getZxid() != lastQueued + 1) {
ZooLog.logWarn("Got zxid "
+ Long.toHexString(hdr.getZxid())
+ " expected "
+ Long.toHexString(lastQueued + 1));
}
lastQueued = hdr.getZxid();
zk.logRequest(hdr, txn);
break;
// Commit for the zxid carried by the packet.
case Leader.COMMIT:
zk.commit(qp.getZxid());
break;
// Per the original note, the leader considers this follower up to date:
// take a snapshot and attach the server to the client connection factory.
case Leader.UPTODATE:
zk.snapshot();
self.cnxnFactory.setZooKeeperServer(zk);
break;
// Result of a session revalidation: session id followed by a valid flag.
case Leader.REVALIDATE:
ByteArrayInputStream bis = new ByteArrayInputStream(qp
.getData());
DataInputStream dis = new DataInputStream(bis);
long sessionId = dis.readLong();
boolean valid = dis.readBoolean();
synchronized (pendingRevalidations) {
ServerCnxn cnxn = pendingRevalidations
.remove(sessionId);
if (cnxn == null) {
ZooLog.logWarn("Missing "
+ Long.toHexString(sessionId)
+ " for validation");
} else {
cnxn.finishSessionInit(valid);
}
}
ZooLog.logTextTraceMessage("Session " + sessionId
+ " is valid: " + valid, ZooLog.SESSION_TRACE_MASK);
}
}
} catch (IOException e) {
// Any I/O failure ends following: close the socket and drop all pending
// revalidations, waking up anything waiting on them.
try {
sock.close();
} catch (IOException e1) {
e1.printStackTrace();
}
synchronized (pendingRevalidations) {
// clear pending revalitions
pendingRevalidations.clear();
pendingRevalidations.notifyAll();
}
} finally {
// Whatever happened, go back to LOOKING.
self.state = ServerState.LOOKING;
}
}
如果当前zks为LEADING状态,会进入到Leader#lead方法内。
/**
 * This method is the main function that is called to lead.
 *
 * It creates and starts the LeaderZooKeeperServer, moves to a new epoch
 * (the upper 32 bits of the zxid are incremented, the lower 32 bits are
 * zero), queues the NEWLEADER proposal, and starts a thread that accepts
 * follower connections. It then waits until more than half of the
 * configured peers have acknowledged NEWLEADER, giving up leadership when
 * that takes longer than initLimit ticks. Afterwards it pings all followers
 * twice per tick and gives up leadership when fewer than half of the
 * configured peers are in sync. Giving up means calling shutdown and
 * setting self.state to LOOKING.
 *
 * @throws IOException
 *             declared by the original; presumably propagated from the
 *             server setup calls -- confirm against the callees
 * @throws InterruptedException
 *             if the thread is interrupted while sleeping between ticks
 */
void lead() throws IOException, InterruptedException {
    self.tick = 0;
    // Create the LeaderZooKeeperServer instance.
    zk = new LeaderZooKeeperServer(self.getId(), self.dataDir,
            self.dataLogDir, this);
    // Load the local data.
    zk.loadData();
    // Start the leader server instance.
    zk.startup();
    // New epoch: bump the upper 32 bits of the last logged zxid and start
    // the lower 32 bits at zero.
    long epoch = self.getLastLoggedZxid() >> 32L;
    epoch++;
    zk.setZxid(epoch << 32L);
    zk.dataTree.lastProcessedZxid = zk.getZxid();
    lastProposed = zk.getZxid();
    newLeaderProposal.packet = new QuorumPacket(NEWLEADER, zk.getZxid(),
            null, null);
    if ((newLeaderProposal.packet.getZxid() & 0xffffffffL) != 0) {
        ZooLog.logError("NEWLEADER proposal has Zxid of "
                + newLeaderProposal.packet.getZxid());
    }
    outstandingProposals.add(newLeaderProposal);
    // Start the thread that accepts follower connections.
    new Thread() {
        public void run() {
            try {
                while (true) {
                    Socket s = ss.accept();
                    s.setSoTimeout(self.tickTime * self.syncLimit);
                    s.setTcpNoDelay(true);
                    // Per the original note, the leader/follower
                    // interaction logic lives in FollowerHandler.
                    new FollowerHandler(s, Leader.this);
                }
            } catch (Exception e) {
                // The acceptor thread ends here. Log the cause instead of
                // dropping it silently, so an unexpected failure to accept
                // followers is visible. An exception caused by the server
                // socket being closed on shutdown is logged as well.
                ZooLog.logException(e);
            }
        }
    }.start();
    // We have to get at least a majority of servers in sync with
    // us. We do this by waiting for the NEWLEADER packet to get
    // acknowledged
    newLeaderProposal.ackCount++;
    while (newLeaderProposal.ackCount <= self.quorumPeers.size() / 2) {
        if (self.tick > self.initLimit) {
            // Followers aren't syncing fast enough,
            // renounce leadership!
            shutdown("Waiting for " + (self.quorumPeers.size() / 2)
                    + " followers, only synced with "
                    + newLeaderProposal.ackCount);
            if (followers.size() >= self.quorumPeers.size() / 2) {
                ZooLog
                        .logWarn("Enough followers present. Perhaps the initTicks need to be increased.");
            }
            self.state = ServerState.LOOKING;
            return;
        }
        Thread.sleep(self.tickTime);
        self.tick++;
    }
    // Unless zookeeper.leaderServes is set to "no", attach the leader
    // server to the client connection factory.
    if (!System.getProperty("zookeeper.leaderServes", "yes").equals("no")) {
        self.cnxnFactory.setZooKeeperServer(zk);
    }
    // Everything is a go, simply start counting the ticks
    synchronized (this) {
        notifyAll();
    }
    // We ping twice a tick, so we only update the tick every other
    // iteration
    boolean tickSkip = true;
    // Heartbeat loop: ping every follower; give up leadership when fewer
    // than half of the configured peers are in sync.
    while (true) {
        Thread.sleep(self.tickTime / 2);
        if (!tickSkip) {
            self.tick++;
        }
        int syncedCount = 0;
        // lock on the followers when we use it.
        synchronized (followers) {
            for (FollowerHandler f : followers) {
                if (f.synced()) {
                    syncedCount++;
                }
                f.ping();
            }
        }
        if (!tickSkip && syncedCount < self.quorumPeers.size() / 2) {
            // Lost quorum, shutdown
            shutdown("Only " + syncedCount + " followers, need "
                    + (self.quorumPeers.size() / 2));
            // make sure the order is the same!
            // the leader goes to looking
            self.state = ServerState.LOOKING;
            return;
        }
        tickSkip = !tickSkip;
    }
}
至此,以上几个类涵盖了Zookeeper启动、leader选举、follower节点追随leader节点、leader节点发起新的提案分发给follower等内容。
简要启动流程如下:
以上是Zookeeper的主要启动流程,以及主要线程功能等介绍。如有纰漏欢迎随时指出。
结语
通过阅读Zookeeper早期发布的源码,简单梳理了服务启动流程,以及服务主要的线程功能。早期版本已经具备Zookeeper的核心功能,但是代码较少、逻辑清晰。直接看最新版本代码反而可能摸不着头脑。