Zookeeper服务器启动源码
Zookeeper服务器启动的main入口类是 QuorumPeerMain 。它作为服务器单机或集群的启动类,main函数为入口。
/**
 * Server entry point for both standalone and replicated (quorum) mode.
 * All real work happens in initializeAndRun; this method only maps each
 * failure category to a dedicated exit code and records a start-failure
 * audit event before requesting system exit.
 */
public static void main(String[] args) {
QuorumPeerMain main = new QuorumPeerMain();
try {
main.initializeAndRun(args);
} catch (IllegalArgumentException e) {
// Bad command-line arguments: print usage, exit INVALID_INVOCATION.
LOG.error("Invalid arguments, exiting abnormally", e);
LOG.info(USAGE);
System.err.println(USAGE);
ZKAuditProvider.addServerStartFailureAuditLog();
ServiceUtils.requestSystemExit(ExitCode.INVALID_INVOCATION.getValue());
} catch (ConfigException e) {
// Malformed configuration file (zoo.cfg).
LOG.error("Invalid config, exiting abnormally", e);
System.err.println("Invalid config, exiting abnormally");
ZKAuditProvider.addServerStartFailureAuditLog();
ServiceUtils.requestSystemExit(ExitCode.INVALID_INVOCATION.getValue());
} catch (DatadirException e) {
// dataDir/dataLogDir unreadable or unwritable.
LOG.error("Unable to access datadir, exiting abnormally", e);
System.err.println("Unable to access datadir, exiting abnormally");
ZKAuditProvider.addServerStartFailureAuditLog();
ServiceUtils.requestSystemExit(ExitCode.UNABLE_TO_ACCESS_DATADIR.getValue());
} catch (AdminServerException e) {
LOG.error("Unable to start AdminServer, exiting abnormally", e);
System.err.println("Unable to start AdminServer, exiting abnormally");
ZKAuditProvider.addServerStartFailureAuditLog();
ServiceUtils.requestSystemExit(ExitCode.ERROR_STARTING_ADMIN_SERVER.getValue());
} catch (Exception e) {
// Catch-all boundary handler for anything unexpected.
LOG.error("Unexpected exception, exiting abnormally", e);
ZKAuditProvider.addServerStartFailureAuditLog();
ServiceUtils.requestSystemExit(ExitCode.UNEXPECTED_ERROR.getValue());
}
// Normal shutdown path: initializeAndRun returned without throwing.
LOG.info("Exiting normally");
ServiceUtils.requestSystemExit(ExitCode.EXECUTION_FINISHED.getValue());
}
可以看到在main入口里面,唯一的代码走向就是运行initializeAndRun方法。
/**
 * Parses the config file (when exactly one argument is given), starts the
 * periodic snapshot/txn-log purge task, and then either runs in replicated
 * (quorum) mode or delegates to ZooKeeperServerMain for standalone mode.
 *
 * @param args expected to be a single path to zoo.cfg
 */
protected void initializeAndRun(String[] args) throws ConfigException, IOException, AdminServerException {
QuorumPeerConfig config = new QuorumPeerConfig();
if (args.length == 1) {
config.parse(args[0]);// parse the config file into QuorumPeerConfig
}
// Start and schedule the purge task, which periodically cleans up
// old transaction logs and snapshot files.
DatadirCleanupManager purgeMgr = new DatadirCleanupManager(
config.getDataDir(),
config.getDataLogDir(),
config.getSnapRetainCount(),// number of snapshots to retain; older snapshots/logs become purgeable (default 3)
config.getPurgeInterval());
purgeMgr.start();
if (args.length == 1 && config.isDistributed()) {// replicated (quorum) mode
runFromConfig(config);
} else {// standalone: delegate startup to ZooKeeperServerMain
LOG.warn("Either no config or no quorum defined in config, running in standalone mode");
// there is only server in the quorum -- run as standalone
ZooKeeperServerMain.main(args);
}
}
-
可以看我写在代码中的注释此方法会new出一个QuorumPeerConfig类的对象,看名字就可以猜到是服务器的一个配置类,它的属性名称和值对应着配置文件里各个属性的配置。然后会通过此对象的parse方法,将Zoo.cfg文件的属性解析到QuorumPeerConfig对象中。
-
然后会new一个DatadirCleanupManager对象。此类是对事务日志和快照文件的定时清理。比如Zookeeper打快照文件,打快照堆积太多了,占空间大小,而且快照会逐个替代之前的数据,所以会定期清理快照和事务日志数据
-
下面的if判断,判断的是用单机模式启动,还是用集群。
public boolean isDistributed() { //standaloneEnabled=false,表示集群模式 或者参与者大于1 return quorumVerifier != null && (!standaloneEnabled || quorumVerifier.getVotingMembers().size() > 1); }
这是isDistributed方法的代码。上文说了,QuorumPeerConfig的config对象会将文件里的属性解析到类属性上,quorumVerifier是集群验证器,主要完成判断一组server在已给定的配置的server列表中,是否能够构成集群。而standaloneEnabled属性就是代表是否是集群模式,false表示为集群模式,或者集群验证器里面服务器节点数量大于1,代表是集群启动
-
如果是集群服务器启动,则走runFromConfig方法
-
如果是单机服务器启动,则会走ZooKeeperServerMain类的main方法入口
集群服务器启动源码
- 上文说了,集群启动,会走到runFromConfig方法。
/**
 * Boots a replicated-mode server: registers JMX beans, starts the metrics
 * provider, creates and configures the connection factories, builds and
 * fully initializes the QuorumPeer from the parsed config, starts it, and
 * blocks until the QuorumPeer thread exits.
 */
public void runFromConfig(QuorumPeerConfig config) throws IOException, AdminServerException {
try {
ManagedUtil.registerLog4jMBeans();
} catch (JMException e) {
LOG.warn("Unable to register log4j JMX control", e);
}
LOG.info("Starting quorum peer");
MetricsProvider metricsProvider;
try {
metricsProvider = MetricsProviderBootstrap.startMetricsProvider(
config.getMetricsProviderClassName(),
config.getMetricsProviderConfiguration());
} catch (MetricsProviderLifeCycleException error) {
throw new IOException("Cannot boot MetricsProvider " + config.getMetricsProviderClassName(), error);
}
try {
ServerMetrics.metricsProviderInitialized(metricsProvider);
ServerCnxnFactory cnxnFactory = null;
ServerCnxnFactory secureCnxnFactory = null;
// Create and configure the client connection factories (plain and secure),
// each only when the corresponding port is configured.
if (config.getClientPortAddress() != null) {
cnxnFactory = ServerCnxnFactory.createFactory();
cnxnFactory.configure(config.getClientPortAddress(), config.getMaxClientCnxns(), config.getClientPortListenBacklog(), false);
}
if (config.getSecureClientPortAddress() != null) {
secureCnxnFactory = ServerCnxnFactory.createFactory();
secureCnxnFactory.configure(config.getSecureClientPortAddress(), config.getMaxClientCnxns(), config.getClientPortListenBacklog(), true);
}
// Create the QuorumPeer instance; the calls below initialize it.
quorumPeer = getQuorumPeer();
// Create the FileTxnSnapLog (transaction-log + snapshot access).
quorumPeer.setTxnFactory(new FileTxnSnapLog(config.getDataLogDir(), config.getDataDir()));
quorumPeer.enableLocalSessions(config.areLocalSessionsEnabled());
quorumPeer.enableLocalSessionsUpgrading(config.isLocalSessionsUpgradingEnabled());
//quorumPeer.setQuorumPeers(config.getAllMembers());
// Leader-election algorithm selection.
quorumPeer.setElectionType(config.getElectionAlg());
// Server id (the myid file value).
quorumPeer.setMyid(config.getServerId());
quorumPeer.setTickTime(config.getTickTime());
quorumPeer.setMinSessionTimeout(config.getMinSessionTimeout());
quorumPeer.setMaxSessionTimeout(config.getMaxSessionTimeout());
quorumPeer.setInitLimit(config.getInitLimit());
quorumPeer.setSyncLimit(config.getSyncLimit());
quorumPeer.setConnectToLearnerMasterLimit(config.getConnectToLearnerMasterLimit());
quorumPeer.setObserverMasterPort(config.getObserverMasterPort());
quorumPeer.setConfigFileName(config.getConfigFilename());
quorumPeer.setClientPortListenBacklog(config.getClientPortListenBacklog());
// Create the in-memory ZKDatabase backed by the txn/snap log.
quorumPeer.setZKDatabase(new ZKDatabase(quorumPeer.getTxnFactory()));
quorumPeer.setQuorumVerifier(config.getQuorumVerifier(), false);
if (config.getLastSeenQuorumVerifier() != null) {
quorumPeer.setLastSeenQuorumVerifier(config.getLastSeenQuorumVerifier(), false);
}
quorumPeer.initConfigInZKDatabase();
quorumPeer.setCnxnFactory(cnxnFactory);
quorumPeer.setSecureCnxnFactory(secureCnxnFactory);
quorumPeer.setSslQuorum(config.isSslQuorum());
quorumPeer.setUsePortUnification(config.shouldUsePortUnification());
quorumPeer.setLearnerType(config.getPeerType());
quorumPeer.setSyncEnabled(config.getSyncEnabled());
quorumPeer.setQuorumListenOnAllIPs(config.getQuorumListenOnAllIPs());
if (config.sslQuorumReloadCertFiles) {
quorumPeer.getX509Util().enableCertFileReloading();
}
quorumPeer.setMultiAddressEnabled(config.isMultiAddressEnabled());
quorumPeer.setMultiAddressReachabilityCheckEnabled(config.isMultiAddressReachabilityCheckEnabled());
quorumPeer.setMultiAddressReachabilityCheckTimeoutMs(config.getMultiAddressReachabilityCheckTimeoutMs());
// sets quorum sasl authentication configurations
quorumPeer.setQuorumSaslEnabled(config.quorumEnableSasl);
if (quorumPeer.isQuorumSaslAuthEnabled()) {
quorumPeer.setQuorumServerSaslRequired(config.quorumServerRequireSasl);
quorumPeer.setQuorumLearnerSaslRequired(config.quorumLearnerRequireSasl);
quorumPeer.setQuorumServicePrincipal(config.quorumServicePrincipal);
quorumPeer.setQuorumServerLoginContext(config.quorumServerLoginContext);
quorumPeer.setQuorumLearnerLoginContext(config.quorumLearnerLoginContext);
}
quorumPeer.setQuorumCnxnThreadsSize(config.quorumCnxnThreadsSize);
// initialize() does little substantive work at this point.
quorumPeer.initialize();
if (config.jvmPauseMonitorToRun) {
quorumPeer.setJvmPauseMonitor(new JvmPauseMonitor(config));
}
// QuorumPeer is a Thread: this starts the server's main loop.
quorumPeer.start();
ZKAuditProvider.addZKStartStopAuditLog();
// Block until the QuorumPeer thread terminates.
quorumPeer.join();
} catch (InterruptedException e) {
// warn, but generally this is ok
LOG.warn("Quorum Peer interrupted", e);
} finally {
if (metricsProvider != null) {
try {
metricsProvider.stop();
} catch (Throwable error) {
LOG.warn("Error while stopping metrics", error);
}
}
}
}
看源码可以发现,在此方法里面,实例并初始化了ServerCnxnFactory对象,它是一个接口,此接口对象是服务器端通信上下文的工厂,后面写文章对源码进行解析。
然后会new一个QuorumPeer对象,后面对此对象进行初始化,属性进行赋值。可以看我写在源码中的注释,有几个重点的属性赋值。比如说
//创建FileTxnSnapLog
quorumPeer.setTxnFactory(new FileTxnSnapLog(config.getDataLogDir(), config.getDataDir()));
FileTxnSnapLog对象会在后面文章对源码进行解析。此类包含了TxnLog和SnapShot接口,即对FileTxnSnapLog的很多操作都会转发给TxnLog和SnapLog进行操作,这是一种典型的组合方法。TxnLog是事务日志处理类的接口,SnapLog是快照处理类的接口。和Redis类似,持久化既有快照处理,也有事务日志处理。
可以看到FileTxnSnapLog类的构造器里面,有两个参数,都是从config,也就是文件解析得到的。猜也可以猜到是事务日志文件和快照文件。看源码可以看到是通过上文解析配置文件得到这两个文件目录,再通过参数传进FileTxnSnapLog,使其能对日志和快照文件进行操作。
// Accessors for the two on-disk directories parsed from the config file.
public File getDataLogDir() {
return dataLogDir;
}
public File getDataDir() {
return dataDir;
}
protected File dataDir;// snapshot directory (config "dataDir")
protected File dataLogDir;// transaction-log directory (config "dataLogDir")
//设置myid
quorumPeer.setMyid(config.getServerId());根据配置文件解析得到此集群节点的id号
//创建ZKDatabase
quorumPeer.setZKDatabase(new ZKDatabase(quorumPeer.getTxnFactory()));
ZKDatabase是zk服务器的数据库,里面有存放了DataTree和sessionsWithTimeouts。这个会再后面文章中进行解析。
- 然后会执行QuorumPeer的start方法
/**
 * Starts this peer in replicated mode. Steps:
 * 1. load the database (restore the DataTree from disk)
 * 2. start the ServerCnxnFactory (accept/selector threads)
 * 3. start the AdminServer
 * 4. initialize the leader-election machinery
 * 5. super.start() runs this thread's run(), which performs leader
 *    election and then serves client requests
 */
public synchronized void start() {
if (!getView().containsKey(myid)) {
throw new RuntimeException("My id " + myid + " not in the peer list");
}
// Restore the DataTree from the txn-log dir (dataLogDir) and snapshot dir (dataDir).
loadDataBase();
// Open the client port; start the factory's AcceptorThread/SelectorThreads.
startServerCnxnFactory();
try {
adminServer.start();
} catch (AdminServerException e) {
// AdminServer failure is treated as non-fatal: warn and keep starting.
LOG.warn("Problem starting AdminServer", e);
System.out.println(e);
}
// Initialize leader election: application-level threads/queues plus the
// transport-level (connection-manager) threads/queues.
startLeaderElection();// builds the election algorithm instance
startJvmPauseMonitor();
super.start();// start the QuorumPeer thread, which monitors server state
}
- 首先是初始化DataTree,也就是loadDataBase。在此方法里,主要是调用了zkDataBase的loadDataBase方法,最终是调用了FileTxnSnapLog对象的restore方法,
将磁盘上的数据库加载到内存中,并将事务添加到内存中的提交日志(committedLog)中,返回磁盘上的最后一个有效zxid:`long zxid = snapLog.restore(dataTree, sessionsWithTimeouts, commitProposalPlaybackListener);`
public long restore(DataTree dt, Map<Long, Integer> sessions, PlayBackListener listener) throws IOException { long snapLoadingStartTime = Time.currentElapsedTime(); //根据snap文件反序列化dt和sessions long deserializeResult = snapLog.deserialize(dt, sessions); ServerMetrics.getMetrics().STARTUP_SNAP_LOAD_TIME.add(Time.currentElapsedTime() - snapLoadingStartTime); FileTxnLog txnLog = new FileTxnLog(dataDir); boolean trustEmptyDB; File initFile = new File(dataDir.getParent(), "initialize"); if (Files.deleteIfExists(initFile.toPath())) { LOG.info("Initialize file found, an empty database will not block voting participation"); trustEmptyDB = true; } else { trustEmptyDB = autoCreateDB; } RestoreFinalizer finalizer = () -> { long highestZxid = fastForwardFromEdits(dt, sessions, listener); // The snapshotZxidDigest will reset after replaying the txn of the zxid in the snapshotZxidDigest, if it's not reset to null after // restoring, it means either there are not enough txns to cover that // zxid or that txn is missing /** * 在重播snapshotZxidDigest中的zxid的txn之后,snapshotZxidDigest将重置,如果还原后//仍未将其重置为null,则意味着txns不足以覆盖// zxid或txn丢失 */ DataTree.ZxidDigest snapshotZxidDigest = dt.getDigestFromLoadedSnapshot(); if (snapshotZxidDigest != null) { LOG.warn( "Highest txn zxid 0x{} is not covering the snapshot digest zxid 0x{}, " + "which might lead to inconsistent state", Long.toHexString(highestZxid), Long.toHexString(snapshotZxidDigest.getZxid())); } return highestZxid; }; if (-1L == deserializeResult) { /* this means that we couldn't find any snapshot, so we need to * initialize an empty database (reported in ZOOKEEPER-2325) */ if (txnLog.getLastLoggedZxid() != -1) { // ZOOKEEPER-3056: provides an escape hatch for users upgrading // from old versions of zookeeper (3.4.x, pre 3.5.3). 
if (!trustEmptySnapshot) { throw new IOException(EMPTY_SNAPSHOT_WARNING + "Something is broken!"); } else { LOG.warn("{}This should only be allowed during upgrading.", EMPTY_SNAPSHOT_WARNING); return finalizer.run(); } } if (trustEmptyDB) { /* TODO: (br33d) we should either put a ConcurrentHashMap on restore() * or use Map on save() */ save(dt, (ConcurrentHashMap<Long, Integer>) sessions, false); /* return a zxid of 0, since we know the database is empty */ return 0L; } else { /* return a zxid of -1, since we are possibly missing data */ LOG.warn("Unexpected empty data tree, setting zxid to -1"); dt.lastProcessedZxid = -1L; return -1L; } } return finalizer.run(); }
可以看到先是通过snapshot文件反序列化datatree和sessions,再获取比snapshot文件中的zxid+1大的log文件的迭代器,以对log文件中的事务进行迭代,迭代log文件的每个事务,并且将该事务应用在datatree中,最后关闭迭代器,返回log文件中最后一个事务的zxid(作为最高的zxid)
- 然后是startServerCnxnFactory()
/**
 * Starts the plaintext and secure connection factories; either may be null
 * when the corresponding port is not configured.
 */
private void startServerCnxnFactory() {
    if (cnxnFactory != null) {
        cnxnFactory.start();
    }
    if (secureCnxnFactory != null) {
        secureCnxnFactory.start();
    }
}
上面的secureCnxnFactory是用于处理加密(TLS/SSL)客户端连接的工厂,与普通的cnxnFactory逻辑相同,只是监听安全端口。ServerCnxnFactory是一个接口,实现的子类有NIOServerCnxnFactory和NettyServerCnxnFactory,也可以看出,zk既可以通过NIO进行通信处理,也可以使用Netty,不过默认是NIO。看看NIOServerCnxnFactory的start()
public void start() { stopped = false; if (workerPool == null) { workerPool = new WorkerService("NIOWorker", numWorkerThreads, false); } for (SelectorThread thread : selectorThreads) { if (thread.getState() == Thread.State.NEW) { thread.start(); } } // ensure thread is started once and only once if (acceptThread.getState() == Thread.State.NEW) { acceptThread.start(); } if (expirerThread.getState() == Thread.State.NEW) { expirerThread.start(); } }
workerPool是一个线程池,SelectorThread和AcceptThread还有ExpirerThread都继承了Thread。可以看到workerPool是先创建了,还没有进行线程池的启动,所以先关心下面三个线程的启动,需要看他们的run方法。而且,看这个for循环,遍历这个selectorThreads,就会想。这个selectorThreads是从哪里来的,于是一步步找,会找到再一个configure方法里,会向这个selectorThreads 的set里加入SelectorThread,然后再向上找,就可以发现,是在QuorumPeerMain类里quorumPeer对象初始化之前,会初始化ServerCnxnFactory
// Caller side (QuorumPeerMain.runFromConfig): the factory is created and
// configured only when a client port is present in the config.
if (config.getClientPortAddress() != null) {
    cnxnFactory = ServerCnxnFactory.createFactory();
    cnxnFactory.configure(config.getClientPortAddress(), config.getMaxClientCnxns(), config.getClientPortListenBacklog(), false);
}

/**
 * Instantiates the ServerCnxnFactory implementation named by the
 * system property; falls back to the NIO-based factory when unset.
 */
public static ServerCnxnFactory createFactory() throws IOException {
    String serverCnxnFactoryName = System.getProperty(ZOOKEEPER_SERVER_CNXN_FACTORY);
    if (serverCnxnFactoryName == null) {
        // NIO is the default transport.
        serverCnxnFactoryName = NIOServerCnxnFactory.class.getName();
    }
    try {
        ServerCnxnFactory serverCnxnFactory = (ServerCnxnFactory) Class.forName(serverCnxnFactoryName)
            .getDeclaredConstructor()
            .newInstance();
        LOG.info("Using {} as server connection factory", serverCnxnFactoryName);
        return serverCnxnFactory;
    } catch (Exception e) {
        IOException ioe = new IOException("Couldn't instantiate " + serverCnxnFactoryName, e);
        throw ioe;
    }
}
可以看到调用了createFactory,里面也证明了我之前说的,默认通信是用NIO。而下一个就调用了configure方法,configure方法主要是初始化thread,完成socket相关配置。所以上面能够启动各个thread,原因是在服务器启动的时候,对这些Thread进行了创建和初始化。
// 4 selector threads seems to be the optimal choice on a 32-core machine
// (count scales as sqrt(cores/2), minimum 1, overridable via system property).
numSelectorThreads = Integer.getInteger(
ZOOKEEPER_NIO_NUM_SELECTOR_THREADS,
Math.max((int) Math.sqrt((float) numCores / 2), 1));
// Pre-create the selector threads; they are started later by start().
for (int i = 0; i < numSelectorThreads; ++i) {
selectorThreads.add(new SelectorThread(i));
}
看configure()方法里的这些代码,可以看到SelectorThread的数量是根据CPU核数来确定的。
而SelectorThread和AcceptThread是干什么的,这个了解过NIO的就可以猜到,AcceptThread就是用来与客户端进行连接的,接收各个客户端发过来进行连接的SelectionKey。AcceptThread在服务器里只有一个,而SelectorThread上面看到是有多个的,怎么说呢,就是:
AcceptThread new出来的socketchannel,是交给了SelectorThread里面的acceptedQueue队列 。也就是说,此服务器通过AcceptThread与客户端进行IO连接的socketChannel交给了SelectorThread,让 SelectorThread来处理读或写事件。反正是NIO的东西吧,这样就起到了一个异步的效果,并发更高了。
可以看看AcceptThread里,run方法里面的doAccept方法的源码
/**
 * Accepts one pending socket connection, enforces the global and per-IP
 * connection limits, switches the channel to non-blocking mode, and hands
 * it to a selector thread chosen round-robin.
 *
 * @return true when a connection was pulled off the accept queue
 */
private boolean doAccept() {
boolean accepted = false;
SocketChannel sc = null;
try {
sc = acceptSocket.accept();
accepted = true;
if (limitTotalNumberOfCnxns()) {
throw new IOException("Too many connections max allowed is " + maxCnxns);
}
InetAddress ia = sc.socket().getInetAddress();
int cnxncount = getClientCnxnCount(ia);
if (maxClientCnxns > 0 && cnxncount >= maxClientCnxns) {
throw new IOException("Too many connections from " + ia + " - max is " + maxClientCnxns);
}
LOG.debug("Accepted socket connection from {}", sc.socket().getRemoteSocketAddress());
sc.configureBlocking(false);
// Round-robin assign this connection to a selector thread
// (reset the iterator when the previous pass over the set is exhausted).
if (!selectorIterator.hasNext()) {
selectorIterator = selectorThreads.iterator();
}
SelectorThread selectorThread = selectorIterator.next();
// Hand the accepted channel to the selector thread's acceptedQueue:
// the accept thread only accepts; read/write handling is done
// asynchronously by the selector threads.
if (!selectorThread.addAcceptedConnection(sc)) {
throw new IOException("Unable to add connection to selector queue"
+ (stopped ? " (shutdown in progress)" : ""));
}
acceptErrorLogger.flush();
} catch (IOException e) {
// accept, maxClientCnxns, configureBlocking
ServerMetrics.getMetrics().CONNECTION_REJECTED.add(1);
acceptErrorLogger.rateLimitLog("Error accepting new connection: " + e.getMessage());
fastCloseSock(sc);
}
return accepted;
}
就可以很明显的看到AcceptThread接受新的套接字连接,循环分配给选择器线程进行处理,返回是否从接受队列中拉出连接。
接下来再看SelectorThread的run方法
/**
 * Selector-thread main loop: repeatedly selects ready events, registers
 * newly accepted connections, and applies interest-op updates, until the
 * factory is stopped; then cleans up all remaining connections.
 */
public void run() {
try {
while (!stopped) {// loop until the factory is stopped
try {
// Wait for ready events (events must have been registered first)
// and dispatch the ready read/write keys.
select();
// Register OP_READ on the selector for each channel queued in acceptedQueue.
processAcceptedConnections();
processInterestOpsUpdateRequests();
} catch (RuntimeException e) {
LOG.warn("Ignoring unexpected runtime exception", e);
} catch (Exception e) {
LOG.warn("Ignoring unexpected exception", e);
}
}
// Close connections still pending on the selector; any other in-flight
// work is drained from the work queues below.
for (SelectionKey key : selector.keys()) {
NIOServerCnxn cnxn = (NIOServerCnxn) key.attachment();
if (cnxn.isSelectable()) {
cnxn.close(ServerCnxn.DisconnectReason.SERVER_SHUTDOWN);
}
cleanupSelectionKey(key);
}
SocketChannel accepted;
while ((accepted = acceptedQueue.poll()) != null) {
fastCloseSock(accepted);
}
updateQueue.clear();
} finally {
closeSelector();
// This will wake up the accept thread and the other selector
// threads, and tell the worker thread pool to begin shutdown.
NIOServerCnxnFactory.this.stop();
LOG.info("selector thread exitted run method");
}
}
可以看看我写在里面的注释
再看select()
/**
 * Performs one selection pass: collects the ready keys, shuffles them so no
 * single connection is favored, and hands each readable/writable key to
 * handleIO for asynchronous processing.
 */
private void select() {
try {
// author note: when no read/write interest is registered this call
// does not stall the loop — NOTE(review): selector.select() normally
// blocks until an event or wakeup; confirm against the wakeup calls
// elsewhere in NIOServerCnxnFactory.
selector.select();
Set<SelectionKey> selected = selector.selectedKeys();
ArrayList<SelectionKey> selectedList = new ArrayList<SelectionKey>(selected);
Collections.shuffle(selectedList);// randomize order to balance across connections
Iterator<SelectionKey> selectedKeys = selectedList.iterator();
while (!stopped && selectedKeys.hasNext()) {
SelectionKey key = selectedKeys.next();
selected.remove(key);
if (!key.isValid()) {
cleanupSelectionKey(key);
continue;
}
// Readable or writable: dispatch the ready event.
if (key.isReadable() || key.isWritable()) {
// Handle the client's request/command; the key represents one ready event.
handleIO(key);
} else {
LOG.warn("Unexpected ops in select {}", key.readyOps());
}
}
} catch (IOException e) {
LOG.warn("Ignoring IOException while selecting", e);
}
}
可以看到它会查看有没有AcceptThread传过来selectionKey,如果有的话,就通过handleIO方法处理此selectionKey。这时也可以理解NIO嘛,AcceptThread用来连接,SelectorThread用来处理命令,是不是呢,看handleIO源码
/**
 * Schedules I/O for a ready key on the worker pool. Selection on the key is
 * suspended (interest ops cleared) while its connection is being processed.
 */
private void handleIO(SelectionKey key) {
// Wrap the key in an IOWorkRequest so the workerPool can schedule it.
IOWorkRequest workRequest = new IOWorkRequest(this, key);
NIOServerCnxn cnxn = (NIOServerCnxn) key.attachment();
// Stop selecting this key while processing on its
// connection
cnxn.disableSelectable();
key.interestOps(0);
touchCnxn(cnxn);
// Hand the IOWorkRequest to the worker thread pool.
workerPool.schedule(workRequest);
}
可以看到,出现了在上面初始化的workerpool线程池,所以selectorThread是后面交给线程池来处理这个selectionkey的。
再来看这个线程池是干嘛。可以看它的execute是处理了个叫scheduledWorkRequest的东西。看源码就可以看出其实这个东西是由handleIO方法里的IOworkRequest封装的,
看IOWorkRequest的doWork()
/**
 * Worker-pool entry for one scheduled key: runs the connection's actual I/O
 * via doIO, then re-enables selection and queues an interest-ops update so
 * the selector thread resumes watching this connection.
 */
public void doWork() throws InterruptedException {
if (!key.isValid()) {
selectorThread.cleanupSelectionKey(key);
return;
}
if (key.isReadable() || key.isWritable()) {
// The real I/O handling happens inside doIO.
cnxn.doIO(key);
// Check if we shutdown or doIO() closed this connection
if (stopped) {
cnxn.close(ServerCnxn.DisconnectReason.SERVER_SHUTDOWN);
return;
}
if (!key.isValid()) {
selectorThread.cleanupSelectionKey(key);
return;
}
touchCnxn(cnxn);
}
// Mark this connection as once again ready for selection
cnxn.enableSelectable();
// Push an update request on the queue to resume selecting
// on the current set of interest ops, which may have changed
// as a result of the I/O operations we just performed.
if (!selectorThread.addInterestOpsUpdateRequest(key)) {
cnxn.close(ServerCnxn.DisconnectReason.CONNECTION_MODE_CHANGED);
}
}
里面的doIO才是真正进行IO处理这个key,看看代码
/**
 * Performs the read/write I/O for one ready key. Reads use a two-phase
 * protocol: first a 4-byte length header (lenBuffer), then a payload buffer
 * of exactly that length — this is how framing (TCP sticky/split packets)
 * is handled. Each caught exception type maps to a specific close reason.
 */
void doIO(SelectionKey k) throws InterruptedException {
try {
if (!isSocketOpen()) {
LOG.warn("trying to do i/o on a null socket for session: 0x{}", Long.toHexString(sessionId));
return;
}
// Readable branch.
if (k.isReadable()) {
int rc = sock.read(incomingBuffer);// initially reads into the 4-byte header buffer (the packet length)
if (rc < 0) {
handleFailedRead();
}
if (incomingBuffer.remaining() == 0) {
boolean isPayload;
if (incomingBuffer == lenBuffer) { // start of the next request
incomingBuffer.flip();
// Parse the 4-byte header: true when it announces payload content,
// false when it was a four-letter command.
isPayload = readLength(k);
incomingBuffer.clear();
} else {// not a new request: the bytes just read are payload content
// continuation
isPayload = true;
}
if (isPayload) { // not the 4-byte header — actual content
// Read and process the payload.
readPayload();
} else {
// four letter words take care
// need not do anything else
return;
}
}
}
if (k.isWritable()) {// writable branch
// e.g. pending responses such as closeSession
handleWrite(k);
if (!initialized && !getReadInterest() && !getWriteInterest()) {
throw new CloseRequestException("responded to info probe", DisconnectReason.INFO_PROBE);
}
}
} catch (CancelledKeyException e) {
LOG.warn("CancelledKeyException causing close of session: 0x{}", Long.toHexString(sessionId));
LOG.debug("CancelledKeyException stack trace", e);
close(DisconnectReason.CANCELLED_KEY_EXCEPTION);
} catch (CloseRequestException e) {// e.g. a closeSession raised inside handleWrite
// expecting close to log session closure
// Removes watchers and closes the socket.
close();
} catch (EndOfStreamException e) {
LOG.warn("Unexpected exception", e);
// expecting close to log session closure
close(e.getReason());
} catch (ClientCnxnLimitException e) {
// Common case exception, print at debug level
ServerMetrics.getMetrics().CONNECTION_REJECTED.add(1);
LOG.warn("Closing session 0x{}", Long.toHexString(sessionId), e);
close(DisconnectReason.CLIENT_CNX_LIMIT);
} catch (IOException e) {
LOG.warn("Close of session 0x{}", Long.toHexString(sessionId), e);
close(DisconnectReason.IO_EXCEPTION);
}
}
这里是解决了一个TCP粘包/拆包的问题:客户端将packet数据包发送过来的时候,会在前4个字节里面说明此数据包的长度大小,让服务器接收的时候,能方便地分配缓冲区。
private final ByteBuffer lenBuffer = ByteBuffer.allocate(4);// fixed 4-byte buffer for reading the length header; its capacity never changes
private ByteBuffer incomingBuffer = lenBuffer;// current read target; reassigned to a buffer of the parsed length for each payload
可以看到incomingBuffer初始就是这个四字节的lenBuffer,第一次read只读这4个字节的长度头;由readLength从长度头解析出的len才是数据包内容的长度,假设为100,后面就会重新分配100字节的incomingBuffer来读内容。然后再看
/**
 * Parses the 4-byte length header already read into lenBuffer, validates it,
 * and allocates incomingBuffer for the announced payload.
 *
 * @return true when a payload should be read next; false when the 4 bytes
 *         spelled a four-letter admin command that has been answered
 * @throws IOException on an invalid length or when the server is not running
 */
private boolean readLength(SelectionKey k) throws IOException {
// Read the length, now get the buffer
int len = lenBuffer.getInt();
// NOTE(review): "sk" is presumably the connection's stored SelectionKey
// field of NIOServerCnxn (not the parameter k) — confirm against the
// enclosing class; before init, a four-letter command gets a direct reply.
if (!initialized && checkFourLetterWord(sk, len)) {
return false;
}
if (len < 0 || len > BinaryInputArchive.maxBuffer) {
throw new IOException("Len error " + len);
}
if (!isZKServerRunning()) {
throw new IOException("ZooKeeperServer not running");
}
// checkRequestSize will throw IOException if request is rejected
zkServer.checkRequestSizeWhenReceivingMessage(len);
incomingBuffer = ByteBuffer.allocate(len);// allocate a buffer sized exactly to the payload
return true;
}
可以看到len就是rc的长度100,然后再下面的给incomingBuffer重新赋值,分配100字节大小的空间。然后再下面的readPayload()方法就是读取真正数据报内容
/**
 * Reads the request payload into incomingBuffer and, once complete,
 * dispatches it: a ConnectRequest while the connection is uninitialized,
 * any other request afterwards. Finally resets incomingBuffer to the
 * 4-byte lenBuffer for the next packet's length header.
 */
private void readPayload() throws IOException, InterruptedException, ClientCnxnLimitException {
if (incomingBuffer.remaining() != 0) { // have we read length bytes?
int rc = sock.read(incomingBuffer); // sock is non-blocking, so ok
if (rc < 0) {
handleFailedRead();
}
}
// When incomingBuffer is full, the whole packet has arrived — process it.
if (incomingBuffer.remaining() == 0) { // have we read length bytes?
incomingBuffer.flip();
packetReceived(4 + incomingBuffer.remaining());// update receive statistics
if (!initialized) {
// Socket is connected but the session is not yet set up: handle the
// ConnectRequest, which sets initialized = true on success.
readConnectRequest();
} else {
// Handle any other command/request.
readRequest();
}
lenBuffer.clear();
incomingBuffer = lenBuffer;// reset to the 4-byte header buffer for the next packet
}
}
如果socket还没有初始化,就会先进行readConnectRequest()方法
如果初始化好了,就处理其他命令
然后下面incomingBuffer会重新赋值为4个字节,读取下个数据包的大小。
里面
zkServer.processConnectRequest(this, incomingBuffer);
/**
 * Handles a client's session-establishment packet: deserializes the
 * ConnectRequest, applies connection throttling, validates the client's
 * last-seen zxid and read-only compatibility, clamps the session timeout to
 * the server's min/max, then either creates a new session (sessionId == 0)
 * or revalidates/reopens an existing one (reconnect).
 */
public void processConnectRequest(ServerCnxn cnxn, ByteBuffer incomingBuffer)
throws IOException, ClientCnxnLimitException {
BinaryInputArchive bia = BinaryInputArchive.getArchive(new ByteBufferInputStream(incomingBuffer));
ConnectRequest connReq = new ConnectRequest();
// Deserialize the wire bytes into the ConnectRequest.
connReq.deserialize(bia, "connect");
LOG.debug(
"Session establishment request from client {} client's lastZxid is 0x{}",
cnxn.getRemoteSocketAddress(),
Long.toHexString(connReq.getLastZxidSeen()));
// sessionId == 0 means a brand-new connection from the client;
// a non-zero sessionId means the client is reconnecting.
long sessionId = connReq.getSessionId();
int tokensNeeded = 1;
// Connection throttling: cost depends on new-local/new-global/renew.
if (connThrottle.isConnectionWeightEnabled()) {
if (sessionId == 0) {
if (localSessionEnabled) {
tokensNeeded = connThrottle.getRequiredTokensForLocal();
} else {
tokensNeeded = connThrottle.getRequiredTokensForGlobal();
}
} else {
tokensNeeded = connThrottle.getRequiredTokensForRenew();
}
}
if (!connThrottle.checkLimit(tokensNeeded)) {
throw new ClientCnxnLimitException();
}
ServerMetrics.getMetrics().CONNECTION_TOKEN_DEFICIT.add(connThrottle.getDeficit());
ServerMetrics.getMetrics().CONNECTION_REQUEST_COUNT.add(1);
boolean readOnly = false;
try {
readOnly = bia.readBool("readOnly");
cnxn.isOldClient = false;
} catch (IOException e) {
// this is ok -- just a packet from an old client which
// doesn't contain readOnly field
LOG.warn(
"Connection request from old client {}; will be dropped if server is in r-o mode",
cnxn.getRemoteSocketAddress());
}
if (!readOnly && this instanceof ReadOnlyZooKeeperServer) {
String msg = "Refusing session request for not-read-only client " + cnxn.getRemoteSocketAddress();
LOG.info(msg);
throw new CloseRequestException(msg, ServerCnxn.DisconnectReason.CLIENT_ZXID_AHEAD);
}
// A client whose last-seen zxid is ahead of the server's must be wrong:
// zxids are generated by the server side.
if (connReq.getLastZxidSeen() > zkDb.dataTree.lastProcessedZxid) {
String msg = "Refusing session request for client "
+ cnxn.getRemoteSocketAddress()
+ " as it has seen zxid 0x"
+ Long.toHexString(connReq.getLastZxidSeen())
+ " our last zxid is 0x"
+ Long.toHexString(getZKDatabase().getDataTreeLastProcessedZxid())
+ " client must try another server";
LOG.info(msg);
throw new CloseRequestException(msg, ServerCnxn.DisconnectReason.NOT_READ_ONLY_CLIENT);
}
// Session timeout requested by the client.
int sessionTimeout = connReq.getTimeOut();
byte[] passwd = connReq.getPasswd();
// Clamp the client's sessionTimeout into the server's [min, max] range.
int minSessionTimeout = getMinSessionTimeout();
if (sessionTimeout < minSessionTimeout) {
sessionTimeout = minSessionTimeout;
}
int maxSessionTimeout = getMaxSessionTimeout();
if (sessionTimeout > maxSessionTimeout) {
sessionTimeout = maxSessionTimeout;
}
cnxn.setSessionTimeout(sessionTimeout);
// Do not accept data packets until the session is fully established.
// We don't want to receive any packets until we are sure that the
// session is setup
cnxn.disableRecv();
if (sessionId == 0) {
// Create a brand-new session.
long id = createSession(cnxn, passwd, sessionTimeout);
LOG.debug(
"Client attempting to establish new session: session = 0x{}, zxid = 0x{}, timeout = {}, address = {}",
Long.toHexString(id),
Long.toHexString(connReq.getLastZxidSeen()),
connReq.getTimeOut(),
cnxn.getRemoteSocketAddress());
} else {
long clientSessionId = connReq.getSessionId();
LOG.debug(
"Client attempting to renew session: session = 0x{}, zxid = 0x{}, timeout = {}, address = {}",
Long.toHexString(clientSessionId),
Long.toHexString(connReq.getLastZxidSeen()),
connReq.getTimeOut(),
cnxn.getRemoteSocketAddress());
if (serverCnxnFactory != null) {
serverCnxnFactory.closeSession(sessionId, ServerCnxn.DisconnectReason.CLIENT_RECONNECT);
}
if (secureServerCnxnFactory != null) {
secureServerCnxnFactory.closeSession(sessionId, ServerCnxn.DisconnectReason.CLIENT_RECONNECT);
}
cnxn.setSessionId(sessionId);
// Revalidate the session and reply to the client.
reopenSession(cnxn, sessionId, passwd, sessionTimeout);
ServerMetrics.getMetrics().CONNECTION_REVALIDATE_COUNT.add(1);
}
}
在这个方法里,zk服务器会初始化session。将客户端传过来的ConnectRequest反序列化,取出要用的进行初始化。BinaryInputArchive是zookeeper里面进行序列化/反序列化的一个实现类,带Input的负责把输入的字节流反序列化。然后会将缓冲区里读到的字节,反序列化为ConnectRequest对象它的属性
// Fields deserialized from a client's ConnectRequest packet:
private int protocolVersion;// wire protocol version
private long lastZxidSeen;// highest zxid the client has observed
private int timeOut;// session timeout requested by the client (ms)
private long sessionId;// 0 for a new session; non-zero when reconnecting
private byte[] passwd;// session password
每个客户端对应着服务器里的一个sessionId。反正是会进行一系列的初始化嘛,然后连接初始化完成后,会发个响应给客户端。
然后是非连接的处理
关键是里面的submitRequest(si);
/** Entry point for a received request: hands it to the throttler's queue. */
public void submitRequest(Request si) {
// Enqueue into the RequestThrottler's submission queue.
enqueueRequest(si);
}
// Queue-plus-thread model: producers enqueue here, the RequestThrottler
// thread drains the queue asynchronously.
public void enqueueRequest(Request si) {
if (requestThrottler == null) {
synchronized (this) {
try {
// Since all requests are passed to the request
// processor it should wait for setting up the request
// processor chain. The state will be updated to RUNNING
// after the setup.
while (state == State.INITIAL) {
wait(1000);
}
} catch (InterruptedException e) {
LOG.warn("Unexpected interruption", e);
}
if (requestThrottler == null) {
throw new RuntimeException("Not started");
}
}
}
requestThrottler.submitRequest(si);
}
/**
 * Queues a request for throttled processing. Requests arriving while the
 * throttler is shutting down are dropped instead of queued.
 */
public void submitRequest(Request request) {
    if (!stopping) {
        // Stamp the enqueue time so queue latency can be reported later.
        request.requestThrottleQueueTime = Time.currentElapsedTime();
        submittedRequests.add(request);
        return;
    }
    LOG.debug("Shutdown in progress. Request cannot be processed");
    dropRequest(request);
}
是添加到了submittedRequests的阻塞队列里。
/**
 * Throttler main loop: takes requests off the submission queue, stalls when
 * the number of in-process requests reaches maxRequests, optionally drops
 * stale requests, and forwards everything else to zks.submitRequestNow.
 */
public void run() {
try {
while (true) {
if (killed) {
break;
}
Request request = submittedRequests.take();
if (Request.requestOfDeath == request) {
break;
}
if (request.mustDrop()) {
continue;
}
// Throttling is disabled when maxRequests == 0.
if (maxRequests > 0) {
while (!killed) {
if (dropStaleRequests && request.isStale()) {
// Note: this will close the connection
dropRequest(request);
ServerMetrics.getMetrics().STALE_REQUESTS_DROPPED.add(1);
request = null;
break;
}
// Proceed while the in-process count is under the limit.
if (zks.getInProcess() < maxRequests) {
break;
}
// Over the limit: sleep briefly before re-checking.
throttleSleep(stallTime);
}
}
if (killed) {
break;
}
// A dropped stale request will be null
// Forward the surviving request for processing.
if (request != null) {
if (request.isStale()) {
ServerMetrics.getMetrics().STALE_REQUESTS.add(1);
}
final long elapsedTime = Time.currentElapsedTime() - request.requestThrottleQueueTime;
ServerMetrics.getMetrics().REQUEST_THROTTLE_QUEUE_TIME.add(elapsedTime);
if (shouldThrottleOp(request, elapsedTime)) {
request.setIsThrottled(true);
ServerMetrics.getMetrics().THROTTLED_OPS.add(1);
}
// The real request handling happens in submitRequestNow.
zks.submitRequestNow(request);
}
}
} catch (InterruptedException e) {
LOG.error("Unexpected interruption", e);
}
int dropped = drainQueue();
LOG.info("RequestThrottler shutdown. Dropped {} requests", dropped);
}
RequestThrottler这个类是一个限流处理器,它也继承了Thread,所以看run方法,而在里面,可以看到就是取到了这个阻塞队列里的request。 其实在zookeeper里面很多地方都使用到了线程加阻塞队列的模型。这样起到的是一个异步效果。
而这个限流器也很容易懂,就是判断maxRequests,也就是在此服务器里的最大request数。如果超过了,就要睡眠;如果没有超过,就break,去进行处理request,除此之外,还要注意的是,当request的类型是closeSession时,会直接执行dropRequest(request);关闭此连接。
真正处理的逻辑是在zks.submitRequestNow(request);这里
/**
 * Immediately submits a request to the server's processor chain: waits for
 * the chain to be set up if needed, refreshes the session, validates the
 * request type, and hands valid packets to firstProcessor. Unknown types go
 * to UnimplementedRequestProcessor; accounting is updated on every path.
 */
public void submitRequestNow(Request si) {
if (firstProcessor == null) {
synchronized (this) {
try {
// Since all requests are passed to the request
// processor it should wait for setting up the request
// processor chain. The state will be updated to RUNNING
// after the setup.
while (state == State.INITIAL) {
wait(1000);
}
} catch (InterruptedException e) {
LOG.warn("Unexpected interruption", e);
}
if (firstProcessor == null || state != State.RUNNING) {
throw new RuntimeException("Not started");
}
}
}
try {
touch(si.cnxn);
boolean validpacket = Request.isValid(si.type);
if (validpacket) {
setLocalSessionFlag(si);
firstProcessor.processRequest(si);
if (si.cnxn != null) {
// Bump the in-process request counter.
incInProcess();
}
} else {
LOG.warn("Received packet at server of unknown type {}", si.type);
// Update request accounting/throttling limits
requestFinished(si);
new UnimplementedRequestProcessor().processRequest(si);
}
} catch (MissingSessionException e) {
LOG.debug("Dropping request.", e);
// Update request accounting/throttling limits
requestFinished(si);
} catch (RequestProcessorException e) {
LOG.error("Unable to process request", e);
// Update request accounting/throttling limits
requestFinished(si);
}
}
啊这,突然发现搞错了,不应该先说集群服务器启动的,这个里面有一些没有搞很懂,先这样吧hhhhh。
单机服务器启动源码
单机服务器启动,就是在ZooKeeperServerMain的main方法入口里。也是对配置文件进行解析。然后根据解析文件的属性对服务器对象进行创建和初始化。
/**
 * Boots a standalone ZooKeeper server from the parsed ServerConfig:
 * metrics provider, txn log/snapshot storage, the ZooKeeperServer itself,
 * the admin server, the (plain and/or secure) connection factories and the
 * container-node manager. Then blocks on a shutdown latch until a graceful
 * shutdown is requested.
 */
public void runFromConfig(ServerConfig config) throws IOException, AdminServerException {
    LOG.info("Starting server");
    FileTxnSnapLog txnLog = null;
    try {
        try {
            // Boot the configured metrics provider first; startup aborts if
            // it cannot be created.
            metricsProvider = MetricsProviderBootstrap.startMetricsProvider(
                config.getMetricsProviderClassName(),
                config.getMetricsProviderConfiguration());
        } catch (MetricsProviderLifeCycleException error) {
            throw new IOException("Cannot boot MetricsProvider " + config.getMetricsProviderClassName(), error);
        }
        ServerMetrics.metricsProviderInitialized(metricsProvider);
        // Note that this thread isn't going to be doing anything else,
        // so rather than spawning another thread, we will just call
        // run() in this thread.
        // create a file logger url from the command line args
        // Create the ZooKeeper data manager (transaction log + snapshots).
        txnLog = new FileTxnSnapLog(config.dataLogDir, config.dataDir);
        // Optionally monitor the JVM for long pauses (e.g. GC stalls).
        JvmPauseMonitor jvmPauseMonitor = null;
        if (config.jvmPauseMonitorToRun) {
            jvmPauseMonitor = new JvmPauseMonitor(config);
        }
        // Create the ZooKeeperServer instance.
        final ZooKeeperServer zkServer = new ZooKeeperServer(jvmPauseMonitor, txnLog, config.tickTime, config.minSessionTimeout, config.maxSessionTimeout, config.listenBacklog, null, config.initialConfig);
        // Attach the server statistics collector (ServerStats) to the txn log.
        txnLog.setServerStats(zkServer.serverStats());
        // Registers shutdown handler which will be used to know the
        // server error or shutdown state changes.
        final CountDownLatch shutdownLatch = new CountDownLatch(1);
        zkServer.registerServerShutdownHandler(new ZooKeeperServerShutdownHandler(shutdownLatch));
        // Start Admin server
        adminServer = AdminServerFactory.createAdminServer();
        adminServer.setZooKeeperServer(zkServer);
        adminServer.start();
        boolean needStartZKServer = true;
        if (config.getClientPortAddress() != null) {
            // Connection factory; NIOServerCnxnFactory by default.
            cnxnFactory = ServerCnxnFactory.createFactory();
            // Initialize the main thread: open the selector, bind the port,
            // and enable NIO accept notifications.
            cnxnFactory.configure(config.getClientPortAddress(), config.getMaxClientCnxns(), config.getClientPortListenBacklog(), false);
            cnxnFactory.startup(zkServer);
            // zkServer has been started. So we don't need to start it again in secureCnxnFactory.
            needStartZKServer = false;
        }
        if (config.getSecureClientPortAddress() != null) {
            // Same default factory (NIOServerCnxnFactory), secure flavor.
            secureCnxnFactory = ServerCnxnFactory.createFactory();
            // Bind the ServerSocketChannel to the secure address/port and set
            // the maximum client connection limit.
            secureCnxnFactory.configure(config.getSecureClientPortAddress(), config.getMaxClientCnxns(), config.getClientPortListenBacklog(), true);
            // Delegates straight to NIOServerCnxnFactory's startup method.
            secureCnxnFactory.startup(zkServer, needStartZKServer);
        }
        // Start the container-znode cleanup timer.
        containerManager = new ContainerManager(
            zkServer.getZKDatabase(),
            zkServer.firstProcessor,
            Integer.getInteger("znode.container.checkIntervalMs", (int) TimeUnit.MINUTES.toMillis(1)),
            Integer.getInteger("znode.container.maxPerMinute", 10000),
            Long.getLong("znode.container.maxNeverUsedIntervalMs", 0)
        );
        containerManager.start();
        ZKAuditProvider.addZKStartStopAuditLog();
        // Watch status of ZooKeeper server. It will do a graceful shutdown if the server is not running or hits an internal error.
        /*
         * Graceful shutdown: when the admin interface or console sends a
         * shutdown command, a receiver thread enqueues it; the worker thread
         * that picks it up counts this latch down by one. The await() below
         * then unblocks and the shutdown sequence runs.
         */
        shutdownLatch.await();
        shutdown();
        if (cnxnFactory != null) {
            cnxnFactory.join();
        }
        if (secureCnxnFactory != null) {
            secureCnxnFactory.join();
        }
        if (zkServer.canShutdown()) {
            zkServer.shutdown(true);
        }
    } catch (InterruptedException e) {
        // warn, but generally this is ok
        LOG.warn("Server interrupted", e);
    } finally {
        // Always release the transaction log and the metrics provider, even
        // on failure paths.
        if (txnLog != null) {
            txnLog.close();
        }
        if (metricsProvider != null) {
            try {
                metricsProvider.stop();
            } catch (Throwable error) {
                LOG.warn("Error while stopping metrics", error);
            }
        }
    }
}
这里用到了一个CountDownLatch(CDL):如果想要关闭服务器,只需要传递一个命令过来,让这个CDL减1即可。
重点是startup
/**
 * Starts the connection factory's NIO threads and (optionally) the
 * ZooKeeper server itself: loads the database, then runs zks.startup().
 *
 * @param zks         the server instance to attach to this factory
 * @param startServer false when the server was already started by another
 *                    factory (e.g. the plain factory started it and this is
 *                    the secure one)
 */
public void startup(ZooKeeperServer zks, boolean startServer) throws IOException, InterruptedException {
    // 1. Initialize the workerService thread pool.
    // 2. Start the SelectorThreads, which handle read/write readiness events.
    // 3. Start the AcceptThread, which handles new connection events.
    start();
    setZooKeeperServer(zks);
    if (startServer) {
        // Initialize the ZKDatabase and load data from disk.
        // This also restores previously persisted sessions (presumably into
        // the sessionsWithTimeouts map - confirm against ZKDatabase).
        zks.startdata();
        // 1. Create the sessionTracker.
        // 2. Initialize the RequestProcessor chain.
        // 3. Create the requestThrottler.
        // 4. Register JMX beans.
        // 5. Switch the server state to RUNNING.
        // 6. notifyAll(): threads started above may be wait()ing for these
        //    preconditions to become true.
        zks.startup();
    }
}
start在集群里面也看过了,startdata也和集群差不多。在zks.startup里面做的事情我也写了注释
/**
 * Brings the server to the RUNNING state: session tracking, the request
 * processor chain, the request throttler, JMX/metrics registration.
 * Synchronized so the final notifyAll() wakes threads blocked in
 * submitRequestNow() waiting for the state to leave INITIAL.
 */
public synchronized void startup() {
    if (sessionTracker == null) {
        createSessionTracker();  // create the session tracker
    }
    startSessionTracker();  // start session management
    setupRequestProcessors();  // build the request processor chain
    startRequestThrottler();
    registerJMX();  // register JMX beans
    startJvmPauseMonitor();
    registerMetrics();
    setState(State.RUNNING);  // from here on the server accepts requests
    requestPathMetricsCollector.start();
    localSessionEnabled = sessionTracker.isLocalSessionsEnabled();
    // Wake up any thread wait()ing on this server for startup to finish.
    notifyAll();
}
这个会话跟踪器,会在后面文章写。重点是初始化请求处理链
/**
 * Wires the standalone request processor chain, built back-to-front:
 * PrepRequestProcessor -> SyncRequestProcessor -> FinalRequestProcessor.
 * The two threaded processors (Prep, Sync) are started here.
 */
protected void setupRequestProcessors() {
    FinalRequestProcessor finalProcessor = new FinalRequestProcessor(this);
    SyncRequestProcessor syncProcessor = new SyncRequestProcessor(this, finalProcessor);
    syncProcessor.start();
    PrepRequestProcessor prepProcessor = new PrepRequestProcessor(this, syncProcessor);
    firstProcessor = prepProcessor;
    prepProcessor.start();
}
有三个请求处理器,这三个应该了解zookeeper的都看过。请求在经过IO到达服务器之后,会先经过一个限流器,才会到这些处理器里。在写集群部分的最后时可以看到,请求会进入firstProcessor的队列里,而这个firstProcessor就是PrepRequestProcessor,第二个是Sync,第三个是Final。前两个都是我之前说过的队列加线程模型。
所以可以看PrepRequestProcessor的run方法。
/**
 * Consumer loop of PrepRequestProcessor: takes requests off the submitted
 * queue one at a time and prepares them via pRequest(). Exits when the
 * shutdown poison pill (Request.requestOfDeath) is dequeued.
 */
public void run() {
    try {
        for (;;) {
            ServerMetrics.getMetrics().PREP_PROCESSOR_QUEUE_SIZE.add(submittedRequests.size());
            // Block until a request is available.
            Request req = submittedRequests.take();
            ServerMetrics.getMetrics().PREP_PROCESSOR_QUEUE_TIME
                .add(Time.currentElapsedTime() - req.prepQueueStartTime);
            // Pings are traced under their own mask.
            long traceMask = (req.type == OpCode.ping)
                ? ZooTrace.CLIENT_PING_TRACE_MASK
                : ZooTrace.CLIENT_REQUEST_TRACE_MASK;
            if (LOG.isTraceEnabled()) {
                ZooTrace.logRequest(LOG, traceMask, 'P', req, "");
            }
            // The poison pill enqueued on shutdown: stop processing.
            if (Request.requestOfDeath == req) {
                break;
            }
            req.prepStartTime = Time.currentElapsedTime();
            // Actually prepare the request.
            pRequest(req);
        }
    } catch (Exception e) {
        handleException(this.getName(), e);
    }
    LOG.info("PrepRequestProcessor exited loop!");
}
关键是这个pRequest
/**
 * Prepares one request: resets its transaction header/body, runs the
 * type-specific preparation (unless the request was throttled), stamps the
 * current zxid, records timing metrics and forwards to the next processor.
 */
protected void pRequest(Request request) throws RequestProcessorException {
    // Reset the txn header and body; pRequestHelper repopulates them for
    // requests that produce a transaction.
    request.setHdr(null);
    request.setTxn(null);
    // Throttled requests skip preparation entirely.
    boolean throttled = request.isThrottled();
    if (!throttled) {
        pRequestHelper(request);
    }
    request.zxid = zks.getZxid();
    long prepElapsed = Time.currentElapsedTime() - request.prepStartTime;
    ServerMetrics.getMetrics().PREP_PROCESS_TIME.add(prepElapsed);
    // Preparation done - hand off to the next processor in the chain.
    nextProcessor.processRequest(request);
}
重点在pRequestHelper里,代码太多就不放上来了,往下点,看create类型的处理也就是pRequest2TxnCreate方法
/**
 * PrepRequestProcessor - create-request preparation.
 *
 * Filters requests: not every request is legal, so the Request must be
 * validated first. Once validation passes, the Request is going to be
 * persisted, so this step also prepares for persistence by generating the
 * Txn and TxnHeader; at persistence time these two properties are read
 * straight off the Request.
 *
 * After the Request has been persisted, the DataTree is updated, and the
 * update is driven by the Txn.
 *
 * Why is a ChangeRecord needed?
 * A ChangeRecord captures the pending modification of a node. Processing a
 * Request depends on the current state of existing nodes - e.g. cversion
 * (a node's child version): handling a create requires bumping the
 * parent's cversion by one. Where does that information come from?
 * Initially from the DataTree, but reading the DataTree for every request
 * would be slow: if the server received two create requests back to back,
 * each would have to read the DataTree, persist, update the DataTree, and
 * only then could the next be handled - a fully serial pipeline; and if
 * the second create were invalid it still could not be validated until
 * the first finished. To break this serial chain, PrepRequestProcessor
 * hands each validated request to the persistence stage asynchronously
 * and immediately moves on. But the second create still depends on the
 * parent info as modified by the first - hence ChangeRecord: when
 * processing the first create, PrepRequestProcessor records a
 * ChangeRecord, then persists/updates the DataTree asynchronously, and
 * the second create simply reads the pending ChangeRecord instead of the
 * DataTree (whose content might not even be up to date yet).
 */
private void pRequest2TxnCreate(int type, Request request, Record record, boolean deserialize) throws IOException, KeeperException {
    if (deserialize) {
        // Deserialize the raw bytes into the Record, to be validated and
        // eventually persisted.
        ByteBufferInputStream.byteBuffer2Record(request.request, record);
    }
    int flags;
    String path;
    List<ACL> acl;
    byte[] data;
    long ttl;
    if (type == OpCode.createTTL) {
        // TTL node: carries an expiry in addition to the usual fields.
        CreateTTLRequest createTtlRequest = (CreateTTLRequest) record;
        flags = createTtlRequest.getFlags();
        path = createTtlRequest.getPath();
        acl = createTtlRequest.getAcl();
        data = createTtlRequest.getData();
        ttl = createTtlRequest.getTtl();
    } else {
        CreateRequest createRequest = (CreateRequest) record;
        flags = createRequest.getFlags();
        // Path of the node being created.
        path = createRequest.getPath();
        acl = createRequest.getAcl();
        data = createRequest.getData();
        ttl = -1;
    }
    // Decode the creation mode (persistent/ephemeral/sequential/container/
    // TTL) from the flags.
    CreateMode createMode = CreateMode.fromFlag(flags);
    // Validate path, mode and ttl against this request.
    validateCreateRequest(path, createMode, request, ttl);
    // Derive and validate the parent path.
    String parentPath = validatePathForCreate(path, request.sessionId);
    List<ACL> listACL = fixupACL(path, request.authInfo, acl);
    // Adding a child modifies the parent (cversion, pzxid, child count), so
    // fetch the parent's (possibly pending) ChangeRecord.
    ChangeRecord parentRecord = getRecordForPath(parentPath);
    // Check CREATE permission on the parent's ACL.
    zks.checkACL(request.cnxn, parentRecord.acl, ZooDefs.Perms.CREATE, request.authInfo, path, listACL);
    // Parent's current child-version number.
    int parentCVersion = parentRecord.stat.getCversion();
    if (createMode.isSequential()) {
        // Sequential node: append a zero-padded 10-digit counter derived
        // from the parent's cversion.
        path = path + String.format(Locale.ENGLISH, "%010d", parentCVersion);
    }
    // Validate the (possibly suffixed) final path.
    validatePath(path, request.sessionId);
    try {
        if (getRecordForPath(path) != null) {
            throw new KeeperException.NodeExistsException(path);
        }
    } catch (KeeperException.NoNodeException e) {
        // ignore this one
    }
    // Ephemeral nodes may not have children.
    boolean ephemeralParent = EphemeralType.get(parentRecord.stat.getEphemeralOwner()) == EphemeralType.NORMAL;
    if (ephemeralParent) {
        throw new KeeperException.NoChildrenForEphemeralsException(path);
    }
    // A child was added, so the parent's cversion goes up by one.
    int newCversion = parentRecord.stat.getCversion() + 1;
    if (type == OpCode.createContainer) {
        // Transaction describing the new node.
        request.setTxn(new CreateContainerTxn(path, data, listACL, newCversion));
    } else if (type == OpCode.createTTL) {
        request.setTxn(new CreateTTLTxn(path, data, listACL, newCversion, ttl));
    } else {
        request.setTxn(new CreateTxn(path, data, listACL, createMode.isEphemeral(), newCversion));
    }
    TxnHeader hdr = request.getHdr();
    long ephemeralOwner = 0;
    if (createMode.isContainer()) {
        ephemeralOwner = EphemeralType.CONTAINER_EPHEMERAL_OWNER;
    } else if (createMode.isTTL()) {
        ephemeralOwner = EphemeralType.TTL.toEphemeralOwner(ttl);
    } else if (createMode.isEphemeral()) {
        // Ephemeral nodes are owned by the creating session.
        ephemeralOwner = request.sessionId;
    }
    StatPersisted s = DataTree.createStat(hdr.getZxid(), hdr.getTime(), ephemeralOwner);
    // Duplicate the parent record at this zxid before mutating it.
    parentRecord = parentRecord.duplicate(request.getHdr().getZxid());
    // One more child under the parent.
    parentRecord.childCount++;
    // Record the parent's new child-version number.
    parentRecord.stat.setCversion(newCversion);
    parentRecord.stat.setPzxid(request.getHdr().getZxid());
    parentRecord.precalculatedDigest = precalculateDigest(
        DigestOpCode.UPDATE, parentPath, parentRecord.data, parentRecord.stat);
    // Publish the pending parent change to outstandingChanges and
    // outstandingChangesForPath.
    addChangeRecord(parentRecord);
    ChangeRecord nodeRecord = new ChangeRecord(
        request.getHdr().getZxid(), path, s, 0, listACL);
    nodeRecord.data = data;
    nodeRecord.precalculatedDigest = precalculateDigest(
        DigestOpCode.ADD, path, nodeRecord.data, s);
    setTxnDigest(request, nodeRecord.precalculatedDigest);
    // Publish the new node's pending ChangeRecord (including its
    // StatPersisted) to outstandingChanges and outstandingChangesForPath.
    addChangeRecord(nodeRecord);
}
这个我的注释已经写的很清楚了,看注释就ok了。
PrepRequestProcessor所做的
1.设置Request的Hdr和Txn
2.生成ChangeRecord提高效率,避免每次都从DataTree里取
3.验证Acl,也就是加了ACL的结点
然后就会交给SyncRequestProcessor的一个队列,同样是由线程不断从这个队列取出request再执行,主要是对request进行持久化,之后再交给FinalRequestProcessor
FinalRequestProcessor做的
1.根据Request更新ZKDatabase
2.触发Watch
3.发送Response给客户端
后面两个Processor的源码后面会再写文章解析,zookeeper里面的Watch机制,和session会话跟踪,还有zookeeper的集群ZAB协议,后面都会写文章进行源码解析。
本人看源码水平不高,写文章水平也不高,勿喷。这篇主要是写给自己看、做一个总结的,可读性不高。