Solr 的恢复机制比较复杂,本文分几部分慢慢梳理其实现。
概述
恢复的流程主要有三种:replay、peerSync 和 replicate。下面结合具体场景,从源码角度分析这三种数据恢复流程。
replay
这个还是先从solr启动说起,分析一下整体的代码流程。
SolrDispatchFilter.init()
@Override
public void init(FilterConfig config) throws ServletException
{
  log.info("SolrDispatchFilter.init(): {}", this.getClass().getClassLoader());

  // Compile the optional comma-separated "excludePatterns" init parameter.
  String exclude = config.getInitParameter("excludePatterns");
  if (exclude != null) {
    excludePatterns = new ArrayList<>();
    for (String pattern : exclude.split(",")) {
      excludePatterns.add(Pattern.compile(pattern));
    }
  }

  try {
    Properties extraProperties = (Properties) config.getServletContext().getAttribute(PROPERTIES_ATTRIBUTE);
    if (extraProperties == null) {
      extraProperties = new Properties();
    }
    String solrHome = (String) config.getServletContext().getAttribute(SOLRHOME_ATTRIBUTE);
    ExecutorUtil.addThreadLocalProvider(SolrRequestInfo.getInheritableThreadLocalProvider());

    // Build the CoreContainer; loading it kicks off core creation and, in
    // cloud mode, ZooKeeper registration (see CoreContainer.load()).
    Path home = solrHome == null ? SolrResourceLoader.locateSolrHome() : Paths.get(solrHome);
    this.cores = createCoreContainer(home, extraProperties);
    this.httpClient = cores.getUpdateShardHandler().getHttpClient();
    log.info("user.dir=" + System.getProperty("user.dir"));
  }
  catch( Throwable t ) {
    // catch this so our filter still works
    log.error( "Could not start Solr. Check solr/home property and the logs");
    SolrCore.log( t );
    if (t instanceof Error) {
      throw (Error) t;
    }
  }

  log.info("SolrDispatchFilter.init() done");
}
/**
 * Builds and loads the CoreContainer for the given Solr home directory.
 * Calling {@code load()} here is what triggers loading of every core.
 */
protected CoreContainer createCoreContainer(Path solrHome, Properties extraProperties) {
  NodeConfig config = loadNodeConfig(solrHome, extraProperties);
  cores = new CoreContainer(config, extraProperties, true);
  cores.load();
  return cores;
}
CoreContainer.load()
......
// Iterate over every discovered core descriptor and schedule its loading.
for (final CoreDescriptor cd : cds) {
if (cd.isTransient() || !cd.isLoadOnStartup()) {
// Transient / lazy cores are only recorded here; they load on demand.
solrCores.putDynamicDescriptor(cd.getName(), cd);
} else if (asyncSolrCoreLoad) {
solrCores.markCoreAsLoading(cd);
}
if (cd.isLoadOnStartup()) {
futures.add(coreLoadExecutor.submit(() -> {
SolrCore core;
try {
if (zkSys.getZkController() != null) {
zkSys.getZkController().throwErrorIfReplicaReplaced(cd);
}
core = create(cd, false); // create the core and load its existing data
} finally {
if (asyncSolrCoreLoad) {
solrCores.markCoreAsNotLoading(cd);
}
}
try {
zkSys.registerInZk(core, true); // register the core in ZooKeeper
} catch (RuntimeException e) {
SolrException.log(log, "Error registering SolrCore", e);
}
return core;
}));
}
}
......
ZkContainer.registerInZk
// ZkContainer: registers a SolrCore in ZooKeeper, either inline or on the
// background registration executor.
public void registerInZk(final SolrCore core, boolean background) {
  Runnable registrationTask = () -> {
    MDCLoggingContext.setCore(core);
    try {
      try {
        // Perform the actual registration (election, recovery check, ...).
        zkController.register(core.getName(), core.getCoreDescriptor());
      } catch (InterruptedException ie) {
        // Restore the interrupted status
        Thread.currentThread().interrupt();
        SolrException.log(log, "", ie);
      } catch (Exception e) {
        // Registration failed: publish the replica as DOWN so the cluster
        // state does not advertise a broken core.
        try {
          zkController.publish(core.getCoreDescriptor(), Replica.State.DOWN);
        } catch (InterruptedException inner) {
          Thread.currentThread().interrupt();
          log.error("", inner);
        } catch (Exception inner) {
          log.error("", inner);
        }
        SolrException.log(log, "", e);
      }
    } finally {
      MDCLoggingContext.clear();
    }
  };

  if (zkController == null) {
    return; // not running in cloud mode — nothing to register
  }
  if (background) {
    coreZkRegister.execute(registrationTask);
  } else {
    MDCLoggingContext.setCore(core);
    try {
      registrationTask.run();
    } finally {
      MDCLoggingContext.clear();
    }
  }
}
ZkController.register()
下面就是关键的流程,先进行节点的选举,然后开始恢复流程,其实这个恢复流程已经涵盖了上面所提到的三个场景。
//
/**
 * Registers a core in ZooKeeper: joins the leader election for its shard,
 * replays the transaction log if the node was shut down before a commit,
 * and runs the standard recovery check (peerSync / replicate) if needed.
 * (Excerpt — elided sections are marked "......".)
 *
 * FIX(review): the quoted excerpt was garbled here — the comment after
 * recoveryFuture.get() was cut mid-sentence and a stray "recoverFromLog"
 * token was left on its own line, breaking the syntax. The comment has been
 * restored and the stray token removed.
 */
public String register(String coreName, final CoreDescriptor desc, boolean recoverReloadedCores, boolean afterExpiration) throws Exception {
  try (SolrCore core = cc.getCore(desc.getName())) {
    MDCLoggingContext.setCore(core);
  }
  try {
    ......
    ......
    try {
      // If we're a preferred leader, insert ourselves at the head of the queue
      boolean joinAtHead = false;
      Replica replica = zkStateReader.getClusterState().getReplica(desc.getCloudDescriptor().getCollectionName(),
          coreZkNodeName);
      if (replica != null) {
        joinAtHead = replica.getBool(SliceMutator.PREFERRED_LEADER_PROP, false);
      }
      joinElection(desc, afterExpiration, joinAtHead); // run the leader election first
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
    } catch (KeeperException | IOException e) {
      throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
    }

    String leaderUrl = getLeader(cloudDesc, leaderVoteWait + 600000);
    String ourUrl = ZkCoreNodeProps.getCoreUrl(baseUrl, coreName);
    log.info("We are " + ourUrl + " and leader is " + leaderUrl);
    boolean isLeader = leaderUrl.equals(ourUrl);

    try (SolrCore core = cc.getCore(desc.getName())) {
      UpdateLog ulog = core.getUpdateHandler().getUpdateLog();
      if (!afterExpiration && !core.isReloaded() && ulog != null) {
        Slice slice = getClusterState().getSlice(collection, shardId);
        if (slice.getState() != Slice.State.CONSTRUCTION || !isLeader) {
          // This is the "replay" entry point: there may be uncommitted data
          // from before the restart, so if the newest tlog does not end with
          // a commit marker the log gets replayed here.
          Future<UpdateLog.RecoveryInfo> recoveryFuture = core.getUpdateHandler().getUpdateLog().recoverFromLog();
          if (recoveryFuture != null) {
            log.info("Replaying tlog for " + ourUrl + " during startup... NOTE: This can take a while.");
            recoveryFuture.get(); // NOTE: this could potentially block for minutes or more!
          } else {
            log.info("No LogReplay needed for core=" + core.getName() + " baseURL=" + baseUrl);
          }
        }
      }
      // Standard recovery check: first decides whether a peerSync suffices;
      // per the article, if more than ~100 docs need recovering it falls back
      // to a full index replication (replicate) from the leader.
      boolean didRecovery = checkRecovery(coreName, desc, recoverReloadedCores, isLeader, cloudDesc, collection,
          coreZkNodeName, shardId, leaderProps, core, cc, afterExpiration);
      if (!didRecovery) {
        // No recovery needed: publish the core as ACTIVE — startup succeeded.
        publish(desc, Replica.State.ACTIVE);
      }
      core.getCoreDescriptor().getCloudDescriptor().setHasRegistered(true);
    }

    // make sure we have an update cluster state right away
    zkStateReader.forceUpdateCollection(collection);
    return shardId;
  } finally {
    MDCLoggingContext.clear();
  }
}
recoverFromLog
主要还是调用 LogReplayer 来进行数据的回放。
/**
 * Decides at startup which transaction logs need to be replayed and submits
 * a LogReplayer task for them. A tlog whose file ends with a commit marker
 * was fully flushed and is skipped; one without the marker still holds
 * uncommitted updates and must be replayed. Returns null when no replay is
 * required.
 */
public Future<RecoveryInfo> recoverFromLog() {
  recoveryInfo = new RecoveryInfo();

  List<TransactionLog> logsToRecover = new ArrayList<>(1);
  for (TransactionLog candidate : newestLogsOnStartup) {
    if (!candidate.try_incref()) continue;
    boolean needsReplay = false;
    try {
      // A trailing commit marker means the data is already durable.
      needsReplay = !candidate.endsWithCommit();
    } catch (IOException e) {
      log.error("Error inspecting tlog " + candidate, e);
    }
    if (needsReplay) {
      logsToRecover.add(candidate);
    } else {
      candidate.decref();
    }
  }

  if (logsToRecover.isEmpty()) return null;

  ExecutorCompletionService<RecoveryInfo> completionService = new ExecutorCompletionService<>(recoveryExecutor);
  LogReplayer replayTask = new LogReplayer(logsToRecover, false); // the replay implementation

  // Flip the state under blockUpdates() so concurrent writers observe it.
  versionInfo.blockUpdates();
  try {
    state = State.REPLAYING;
  } finally {
    versionInfo.unblockUpdates();
  }

  // At this point, we are guaranteed that any new updates coming in will see the state as "replaying"
  return completionService.submit(replayTask, recoveryInfo);
}
LogReplayer
tlog中数据的回放,可能也包含了待删除的数据。
public void doReplay(TransactionLog translog) {
try {
loglog.warn("Starting log replay " + translog + " active=" + activeLog + " starting pos=" + recoveryInfo.positionOfStart);
long lastStatusTime = System.nanoTime();
tlogReader = translog.getReader(recoveryInfo.positionOfStart);
// NOTE: we don't currently handle a core reload during recovery. This would cause the core
// to change underneath us.
UpdateRequestProcessorChain processorChain = req.getCore().getUpdateProcessingChain(null);
UpdateRequestProcessor proc = processorChain.createProcessor(req, rsp); // 还是要构造一个processorChain进行数据的添加,不过这个操作只在本地进行
long commitVersion = 0;
int operationAndFlags = 0;
long nextCount = 0;
for (; ; ) { // 循环遍历所有需要回放的数据
Object o = null;
if (cancelApplyBufferUpdate) break;
try {
if (testing_logReplayHook != null) testing_logReplayHook.run();
if (nextCount++ % 1000 == 0) {
long now = System.nanoTime();
if (now - lastStatusTime > STATUS_TIME) {
lastStatusTime = now;
long cpos = tlogReader.currentPos();
long csize = tlogReader.currentSize();
loglog.info(
"log replay status {} active={} starting pos={} current pos={} current size={} % read={}",
translog, activeLog, recoveryInfo.positionOfStart, cpos, csize,
Math.floor(cpos / (double) csize * 100.));
}
}
o = null;
o = tlogReader.next();
if (o == null && activeLog) {
if (!finishing) {
versionInfo.blockUpdates();
finishing = true;
o = tlogReader.next();
} else {
}
} catch (Exception e) {
SolrException.log(log, e);
}
if (o == null) break;
try {
// should currently be a List<Oper,Ver,Doc/Id>
List entry = (List) o;
operationAndFlags = (Integer) entry.get(0);
int oper = operationAndFlags & OPERATION_MASK;
long version = (Long) entry.get(1);
switch (oper) { // 数据恢复的场景其实也包含了数据删除,删除的数据临时也会保存在内存中。
case UpdateLog.ADD: {
recoveryInfo.adds++;
// byte[] idBytes = (byte[]) entry.get(2);
SolrInputDocument sdoc = (SolrInputDocument) entry.get(entry.size() - 1);
AddUpdateCommand cmd = new AddUpdateCommand(req);
// cmd.setIndexedId(new BytesRef(idBytes));
cmd.solrDoc = sdoc;
cmd.setVersion(version);
cmd.setFlags(UpdateCommand.REPLAY | UpdateCommand.IGNORE_AUTOCOMMIT);
if (debug) log.debug("add " + cmd);
proc.processAdd(cmd);
break;
}
case UpdateLog.DELETE: {
recoveryInfo.deletes++;
byte[] idBytes = (byte[]) entry.get(2);
DeleteUpdateCommand cmd = new DeleteUpdateCommand(req);
cmd.setIndexedId(new BytesRef(idBytes));
cmd.setVersion(version);
cmd.setFlags(UpdateCommand.REPLAY | UpdateCommand.IGNORE_AUTOCOMMIT);
if (debug) log.debug("delete " + cmd);
proc.processDelete(cmd);
break;
}
case UpdateLog.DELETE_BY_QUERY: {
recoveryInfo.deleteByQuery++;
String query = (String) entry.get(2);
DeleteUpdateCommand cmd = new DeleteUpdateCommand(req);
cmd.query = query;
cmd.setVersion(version);
cmd.setFlags(UpdateCommand.REPLAY | UpdateCommand.IGNORE_AUTOCOMMIT);
if (debug) log.debug("deleteByQuery " + cmd);
proc.processDelete(cmd);
break;
}
case UpdateLog.COMMIT: {
commitVersion = version;
break;
}
default:
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown Operation! " + oper);
}
if (rsp.getException() != null) {
loglog.error("REPLAY_ERR: Exception replaying log", rsp.getException());
throw rsp.getException();
}
} catch (IOException ex) {
recoveryInfo.errors++;
loglog.warn("REYPLAY_ERR: IOException reading log", ex);
// could be caused by an incomplete flush if recovering from log
} catch (ClassCastException cl) {
recoveryInfo.errors++;
loglog.warn("REPLAY_ERR: Unexpected log entry or corrupt log. Entry=" + o, cl);
// would be caused by a corrupt transaction log
} catch (SolrException ex) {
if (ex.code() == ErrorCode.SERVICE_UNAVAILABLE.code) {
throw ex;
}
recoveryInfo.errors++;
loglog.warn("REYPLAY_ERR: IOException reading log", ex);
// could be caused by an incomplete flush if recovering from log
} catch (Exception ex) {
recoveryInfo.errors++;
loglog.warn("REPLAY_ERR: Exception replaying log", ex);
// something wrong with the request?
}
assert TestInjection.injectUpdateLogReplayRandomPause();
}
// 最后做一次commit操作,保证数据已经落盘。
CommitUpdateCommand cmd = new CommitUpdateCommand(req, false);
cmd.setVersion(commitVersion);
cmd.softCommit = false;
cmd.waitSearcher = true;
cmd.setFlags(UpdateCommand.REPLAY);
try {
if (debug) log.debug("commit " + cmd);
uhandler.commit(cmd); // this should cause a commit to be added to the incomplete log and avoid it being replayed again after a restart.
} catch (IOException ex) {
recoveryInfo.errors++;
loglog.error("Replay exception: final commit.", ex);
}
if (!activeLog) {
translog.writeCommit(cmd, operationFlags | (operationAndFlags & ~OPERATION_MASK));
}
try {
proc.finish();
} catch (IOException ex) {
recoveryInfo.errors++;
loglog.error("Replay exception: finish()", ex);
}
} finally {
if (tlogReader != null) tlogReader.close();
translog.decref();
}
}
}