服务端入口:org.apache.zookeeper.server.QuorumPeerMain.main
(调用ZooKeeperServerMain.main)
运行参数:配置文件
如:QuorumPeerMain /opt/zookeeper/zookeeper-3.4.3/bin/../conf/zoo.cfg
读取配置文件
QuorumPeerConfig config = new QuorumPeerConfig();
if (args.length == 1) {
config.parse(args[0]);
}
//如果没有设置dynamicConfigFile,则会以兼容模式
//将配置文件当作dynamicConfigFile重新parse
//dynamicConfigFile中可以设置服务器权重等选项
启动autopurge任务
DatadirCleanupManager purgeMgr = new DatadirCleanupManager(config
.getDataDir(), config.getDataLogDir(), config
.getSnapRetainCount(), config.getPurgeInterval());
purgeMgr.start();
//清理snapshot和log,保留最近SnapRetainCount个snapshots
//每PurgeInterval个小时运行
决定是standalone还是Distributed运行模式
if (args.length == 1 && config.isDistributed()) {
runFromConfig(config);
} else {
LOG.warn("Either no config or no quorum defined in config, running "
+ " in standalone mode");
//there is only server in the quorum -- run as standalone
ZooKeeperServerMain.main(args);
}
注册log4j JMX
ManagedUtil.registerLog4jMBeans();
创建ServerCnxnFactory
ServerCnxnFactory cnxnFactory = ServerCnxnFactory.createFactory();
/**
*ServerCnxnFactory
* +NettyServerCnxnFactory
* +NIOServerCnxnFactory
* +NullServerCnxnFactory
*
*默认是NIOServerCnxnFactory
*/
创建Selector、Worker、Accept线程,ServerSocketChannel,配置登录认证
// cnxnFactory.configure(config.getClientPortAddress(),
// config.getMaxClientCnxns());
//NIOServerCnxnFactory.configure
包括如下工作:
//配置安全登录,在jaas.conf中配置
configureSaslLogin();
//清理超时Session链接
cnxnExpiryQueue =
new ExpiryQueue<NIOServerCnxn>(sessionlessCnxnTimeout); //二级CHash
expirerThread = new ConnectionExpirerThread();
//Selector配置
int numCores = Runtime.getRuntime().availableProcessors();
// 32 cores sweet spot seems to be 4 selector threads
numSelectorThreads = Integer.getInteger(
ZOOKEEPER_NIO_NUM_SELECTOR_THREADS,
Math.max((int) Math.sqrt((float) numCores/2), 1));
//Worker配置
numWorkerThreads = Integer.getInteger(
ZOOKEEPER_NIO_NUM_WORKER_THREADS, 2 * numCores);
//ServerSocketChannel
this.ss = ServerSocketChannel.open();
ss.socket().setReuseAddress(true);
LOG.info("binding to port " + addr);
ss.socket().bind(addr);
ss.configureBlocking(false);
//AcceptThread包含一个Channel,多个Selector
acceptThread = new AcceptThread(ss, addr, selectorThreads);
创建QuorumPeer
quorumPeer = new QuorumPeer();
//QuorumPeer构造函数中对zkDB初始化
this.logFactory = new FileTxnSnapLog(dataLogDir, dataDir);
this.zkDb = new ZKDatabase(this.logFactory);
//设置TxnLog,Snapshot
quorumPeer.setTxnFactory(new FileTxnSnapLog(
config.getDataLogDir(),
config.getDataDir()));
…
//zk维护的目录树结构
quorumPeer.setZKDatabase(new ZKDatabase(quorumPeer.getTxnFactory()));
quorumPeer.initConfigInZKDatabase();//初始化/zookeeper/config节点
quorumPeer.setCnxnFactory(cnxnFactory);
…
quorumPeer.start();
quorumPeer.join();
启动QuorumPeer.start()
public synchronized void start() {
loadDataBase();
cnxnFactory.start();
super.start();
}
//loadDataBase 从zk的事务日志snapLog中恢复
long zxid = snapLog.restore(dataTree, sessionsWithTimeouts, playbacklistener);
//playbacklistener回调onTxnLoaded
Request r = new Request(0, hdr.getCxid(), hdr.getType(), hdr, txn, hdr.getZxid());
addCommittedProposal(r);
//addCommittedProposal提交Proposal
QuorumPacket pp = new QuorumPacket(Leader.PROPOSAL, request.zxid,
baos.toByteArray(), null);
Proposal p = new Proposal();
p.packet = pp;
p.request = request;
committedLog.add(p);
//LearnerHandler线程将消费committedLog,发送提交请求。
//cnxnFactory.start()启动NIOServerCnxnFactory所有线程
if (workerPool == null) {
workerPool = new WorkerService(
"NIOWorker", numWorkerThreads, false);
}
for (SelectorThread thread : selectorThreads) {
if (thread.getState() == Thread.State.NEW) {
thread.start();
}
}
//ensure thread is started once and only once
if (acceptThread.getState()== Thread.State.NEW) {
acceptThread.start();
}
if (expirerThread.getState()== Thread.State.NEW) {
expirerThread.start();
}
//startLeaderElection()开始选举过程
//投票设为投自己
if(getPeerState() == ServerState.LOOKING) {
currentVote = new Vote(myid,getLastLoggedZxid(), getCurrentEpoch());
}
//创建responder
try {
udpSocket = new DatagramSocket(myQuorumAddr.getPort());
responder = new ResponderThread();
responder.start();
}
//接收xid,回复myid|leader-id | leader-zxid
//选举算法
this.electionAlg =createElectionAlgorithm(electionType);
switch(electionAlgorithm) {
case 0:
le = new LeaderElection(this); //基于UDP的旧算法;注意配置项electionAlg的默认值是3(FastLeaderElection),并非此分支
break;
case 1:
le = new AuthFastLeaderElection(this);
break;
case 2:
le = new AuthFastLeaderElection(this, true);
break;
case 3:
qcm = new QuorumCnxManager(this);
QuorumCnxManager.Listener listener = qcm.listener;
if (listener != null) {
listener.start();
le = new FastLeaderElection(this, qcm);
} else {
LOG.error("Null listener when initializing cnx manager");
}
break;
default:
assert false;
}
//super.start(); 在LOOKING、OBSERVING、FOLLOWING、LEADING状态之间切换
/*
* Main loop
*/
while (running) {
switch (getPeerState()){
case LOOKING:
…
break;
case OBSERVING:
…
break;
case FOLLOWING:
…
break;
case LEADING:
…
break;
}
start_fle =System.currentTimeMillis();
}
OBSERVING状态
//observer.observeLeader();
//内部循环
QuorumPacket qp = new QuorumPacket();
while (self.isRunning()){
readPacket(qp);
processPacket(qp);
}
FOLLOWING状态
//follower.followLeader();
//内部循环
QuorumPacket qp = new QuorumPacket();
while (self.isRunning()){
readPacket(qp);
processPacket(qp);
}
LEADING状态
//leader.lead();
//从log中恢复
zk.loadData();
//接受Learner连接
//Start thread that waits for connection requests from
// new followers.
cnxAcceptor = new LearnerCnxAcceptor();
cnxAcceptor.setName("LearnerCnxAcceptor-" + ss.getLocalSocketAddress());
cnxAcceptor.start();
//发送NEWLEADER,并等待回复
newLeaderProposal.packet = new QuorumPacket(NEWLEADER, zk.getZxid(),
null, null);
waitForNewLeaderAck(self.getId(), zk.getZxid(), LearnerType.PARTICIPANT);
//
startZkServer();
//内部循环
while (true) {
//check we have a supporting quorum, so only
//PARTICIPANT, not OBSERVER, learners should be used
//If not, return
}
//startZKServer
//ZooKeeperServer.startup
startSessionTracker(); //关闭失效Session
for(SessionImpl s :sessionExpiryQueue.poll()){
setSessionClosing(s.sessionId);
expirer.expire(s);
}
setupRequestProcessors(); //该方法被子类重写
//上图转自淘宝技术博客:http://rdc.taobao.com/team/jm/archives/448
//Leader责任链
//LeaderZooKeeperServer.setupRequestProcessors
//对应LeaderZooKeeperServer的第一条责任链
RequestProcessor finalProcessor = new FinalRequestProcessor(this);
RequestProcessor toBeAppliedProcessor = new Leader.
ToBeAppliedRequestProcessor(finalProcessor, getLeader());
commitProcessor = new CommitProcessor(toBeAppliedProcessor,
Long.toString(getServerId()), false);
commitProcessor.start();
ProposalRequestProcessor proposalProcessor = new ProposalRequestProcessor
(this, commitProcessor);
proposalProcessor.initialize();
firstProcessor = new PrepRequestProcessor(this, proposalProcessor);
((PrepRequestProcessor)firstProcessor).start();
//在调用ProposalRequestProcessor时,设置了另外一条链
// ProposalRequestProcessor(this,commitProcessor);
//
AckRequestProcessor ackProcessor = new AckRequestProcessor(zks.getLeader());
syncProcessor = new SyncRequestProcessor(zks, ackProcessor);
//PrepRequestProcessor在最前, 处理各种请求
//processRequest()将来自客户端或者Follower转发的request添加到submittedRequests
//run()线程消费submittedRequests
//检查ACL,根据Sequential设置path,
checkACL(zks,parentRecord.acl, ZooDefs.Perms.CREATE,request.authInfo);
Request request = submittedRequests.take();
//写请求生成一个Txn,然后交给下一个ProposalRequestProcessor
try {
switch(request.type) {
case OpCode.create:
CreateRequest createRequest = new CreateRequest();
pRequest2Txn(request.type, zks.getNextZxid(), request, createRequest, true);
break;
case OpCode.create2:
Create2Request create2Request = new Create2Request();
pRequest2Txn(request.type, zks.getNextZxid(), request, create2Request, true);
break;
}
nextProcessor.processRequest(request);
//ProposalRequestProcessor任务
//转给CommitProcessor,SyncRequestProcessor。
//发送propose
if (request instanceof LearnerSyncRequest) {
zks.getLeader().processSync((LearnerSyncRequest) request);
} else {
nextProcessor.processRequest(request);
if (request.getHdr() != null) {
// We need to sync and get consensus on any transactions
try {
zks.getLeader().propose(request);
} catch (XidRolloverException e) {
throw new RequestProcessorException(e.getMessage(), e);
}
syncProcessor.processRequest(request);
}
}
//propose
QuorumPacket pp = new QuorumPacket(Leader.PROPOSAL, request.zxid,
baos.toByteArray(), null);
Proposal p = new Proposal();
p.packet = pp;
p.request = request;
lastProposed = p.packet.getZxid();
outstandingProposals.put(lastProposed,p);
sendPacket(pp);
//CommitProcessor消费两个队列:queuedRequests和committedRequests
//queuedRequests保存PrepRequestProcessor线程下发的submittedRequest
//committedRequests保存Proposal通过后,LearnerHandler线程发来的提交请求
//检查queuedRequests或者committedRequests是否有内容
//如果是写,则作为pendingRequest,等待表决结果返回到committedRequests
//如果是读,则直接返回本地数据
request = queuedRequests.poll()//run
if(needCommit(request)) {
nextPending.set(request);
} else {
sendToNextProcessor(request);
}
// 等待committedRequest在commit()函数中返回
//committedRequests.add(request);
request = committedRequests.poll()
// LearnerHandler收到过半ACK消息时,调用tryToCommit -> commit
//见后文LearnerHandler
//FinalRequest处理,FinalRequest.processRequest(request)
//写ZKDatabase
Record txn = request.getTxn();
rc = zks.processTxn(hdr,txn);
//->转给ZkDatabase
//rc = getZKDatabase().processTxn(hdr, txn);
//->在dataTree上操作
//return dataTree.processTxn(hdr,txn);
//SyncRequestProcessor任务run,
//写日志,超过设置则轮转日志,建立snapshot
//将request记录到磁盘,批处理request,提高io效率
while(true ){
Request si = queuedRequests.take();
//记录到log
if (zks.getZKDatabase().append(si))
//如果logCount > (snapCount/2 + randRoll),则创建以下线程
snapInProcess = new Thread("SnapshotThread") {
public void run() {
try {
zks.takeSnapshot();
} catch (Exception e) {
LOG.warn("Unexpected exception", e);
}
};
//传递任务
nextProcessor.processRequest(si);
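// AckRequestProcessor接在Leader的SyncRequestProcessor之后,与LearnerHandler一样最终调用leader.processAck
//区别在于:这里传入的是Leader自己的id(self.getId()),即Leader把自己也算作一个ACK;Follower返回的ACK由LearnerHandler处理
/**
* Forward the request as an ACK to the leader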
*/
leader.processAck(self.getId(),request.zxid,null);
//下面介绍前面提到的LearnerHandler
//Leader.lead同时也会创建LearnerCnxAcceptor
// Start thread that waits for connection requests from
// new followers.
cnxAcceptor = new LearnerCnxAcceptor();
cnxAcceptor.setName("LearnerCnxAcceptor-" + ss.getLocalSocketAddress());
cnxAcceptor.start();
//LearnerCnxAcceptor对每个连接,用LearnerHandler处理
Socket s = ss.accept();
//start with the initLimit, once the ack is processed
// in LearnerHandler switch to the syncLimit
s.setSoTimeout(self.tickTime * self.initLimit);
s.setTcpNoDelay(nodelay);
LearnerHandler fh = new LearnerHandler(s, Leader.this);
fh.start();
//LearnerHandler线程
//ACK:Follower对PROPOSAL消息的响应。
//REQUEST:写请求、同步请求
while (true) {
qp = new QuorumPacket();
switch (qp.getType()) {
case Leader.ACK:
if (this.learnerType == LearnerType.OBSERVER) {
if (LOG.isDebugEnabled()) {
LOG.debug("Received ACK from Observer " + this.sid);
}
}
leader.processAck(this.sid,qp.getZxid(),
sock.getLocalSocketAddress());
break;
//Leader.processAck
boolean hasCommitted = tryToCommit(p, zxid, followerAddr);
//Leader.tryToCommit
//如果有过半的voter通过,则发送commit请求,添加到committedRequests队列
//getting a quorum from all necessary configurations
if(!p.hasAllQuorums()) {
return false;
}
commit(zxid);
inform(p);
//客户端请求流程
//找出哪里调用了第一个processRequest
//ZooKeeperServer.submitRequest
firstProcessor.processRequest(si);//PrepRequestProcessor
//<- ZooKeeperServer.createSession
// Request si = new Request(cnxn, sessionId, xid, type, bb, authInfo);
// submitRequest(si);
submitRequest(cnxn, sessionId, OpCode.createSession, 0,to,null);
//<- ZooKeeperServer.processConnectRequest
createSession(cnxn, passwd,sessionTimeout);
//<- NIOServerCnxn.readConnectRequest
zkServer.processConnectRequest(this,incomingBuffer);
//Follower责任链
//FollowerZooKeeperServer.setupRequestProcessors
RequestProcessor finalProcessor = new FinalRequestProcessor(this);
commitProcessor = new CommitProcessor(finalProcessor,
Long.toString(getServerId()), true);
commitProcessor.start();
firstProcessor = new FollowerRequestProcessor(this, commitProcessor);
((FollowerRequestProcessor) firstProcessor).start();
syncProcessor = new SyncRequestProcessor(this,
new SendAckRequestProcessor((Learner) getFollower()));
syncProcessor.start();
// 两条线
// 触发第一条线的仍是基类 ZooKeeperServer.submitRequest
// Follower继承自Learner,与Leader建立了socket连接
// Follower.followLeader处理的消息
PING:返回PING给Leader
PROPOSAL:放入pendingTxns队列,转发给SyncRequestProcessor线程
COMMIT:比较其zxid与pendingTxns队首的zxid是否相同,相同则交给commitProcessor,不同则退出
UPTODATE:同步后,Leader发送此消息,表示follower可以提供服务了
SYNC:返回SYNC结果到客户端,对应于Paxos中的慢速读
while (self.isRunning()){
readPacket(qp);
processPacket(qp);
}
//Leader.PROPOSAL:
// fzk.logRequest(hdr,txn);
Request request = new Request(hdr.getClientId(),
hdr.getCxid(),hdr.getType(), hdr, txn, hdr.getZxid());
if((request.zxid & 0xffffffffL) != 0) {
pendingTxns.add(request);
}
syncProcessor.processRequest(request);
// syncProcessor后接 SendAckRequestProcessor
// SendAckRequestProcessor
//发送ACK给Leader
QuorumPacket qp = new QuorumPacket(Leader.ACK, si.getHdr().getZxid(), null, null);
learner.writePacket(qp,false);
//FollowerRequestProcessor 添加到queuedRequest交给run线程处理
if (!finished) {
queuedRequests.add(request);
}
//run消费queuedRequests
// We want to queue the request to be processed before we submit
// the request to the leader so that we are ready to receive the response
nextProcessor.processRequest(request);//CommitProcessor
//发现如果是写请求,则发送REQUEST消息给Leader
zks.getFollower().request(request);
//CommitProcessor同Leader
//如果是写,则作为pendingRequest,等待表决结果返回到committedRequest
//FinalRequestProcessor 同Leader,更新zkDatabase
Follower提交请求,更新log,回复ACK,收到COMMIT后更新zkDatabase
//Observer责任链
RequestProcessor finalProcessor = new FinalRequestProcessor(this);
commitProcessor = new CommitProcessor(finalProcessor,
Long.toString(getServerId()), true);
commitProcessor.start();
firstProcessor = new ObserverRequestProcessor(this, commitProcessor);
((ObserverRequestProcessor) firstProcessor).start();
syncProcessor = new SyncRequestProcessor(this,
new SendAckRequestProcessor(getObserver()));
syncProcessor.start();
//行为几乎和Follower一样,只是不参与投票(Observer.processPacket忽略消息)
//忽略Leader.PROPOSAL、Leader.COMMIT
Zookeeper最特别的一点是:Leader在发送PROPOSAL消息之前,以及Follower在接收到PROPOSAL消息之后,都会立即将消息记录到日志中。这样在收到过半的ACK之后,即可确认消息已经在过半的server中保存过了。即使之后的COMMIT消息发送失败,该事务在事实上也已经通过。丢失COMMIT消息的Follower会在处理下一个事务时发现这一点,并自动退出,通过重启来重新取得一致性。