Elasticsearch Source Code Analysis 06: Index Recovery

Index Recovery

At the end of the allocation phase, a new cluster state is published to the cluster. Once the state has been published, each node enters IndicesClusterStateService.applyClusterState to apply it.
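
As a rough illustration of how a component receives these callbacks: anything registered with the cluster service as a ClusterStateApplier is invoked with the ClusterChangedEvent, and IndicesClusterStateService is wired up this way. The applier below is a purely hypothetical sketch, assuming the addStateApplier registration method and the indicesDeleted accessor used later in this article:

import org.elasticsearch.cluster.ClusterChangedEvent;
import org.elasticsearch.cluster.ClusterStateApplier;
import org.elasticsearch.cluster.service.ClusterService;

//hypothetical applier: logs index-level changes every time a new cluster state is applied
public class LoggingStateApplier implements ClusterStateApplier {

    @Override
    public void applyClusterState(ClusterChangedEvent event) {
        //indicesDeleted() is the same accessor deleteIndices() relies on below
        event.indicesDeleted().forEach(index -> System.out.println("index deleted: " + index));
    }

    //registration: after this call the applier is invoked for every published cluster state
    public static void register(ClusterService clusterService) {
        clusterService.addStateApplier(new LoggingStateApplier());
    }
}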

@Override
public synchronized void applyClusterState(final ClusterChangedEvent event) {
    if (!lifecycle.started()) {
        return;
    }

    final ClusterState state = event.state();
    //we need to clean the shards and indices on this node, since we
    // will recover them again once state persistence is disabled (no master / not recovered)
    if (state.blocks().disableStatePersistence()) {
        for (AllocatedIndex<? extends Shard> indexService : indicesService) {
            // also cleans shards
            indicesService.removeIndex(indexService.index(), NO_LONGER_ASSIGNED, "cleaning index (disabled block persistence)");
        }
        return;
    }
    //remove entries from the failed shards cache for shards that the master no longer assigns to this node
    updateFailedShardsCache(state);

    //also deletes the shards of indices that have been deleted
    deleteIndices(event);
    //remove indices whose shards are no longer allocated to this node
    removeIndices(event);
    //notify the master about shards it assigned to this node that are not present here
    failMissingShards(state);
    //remove any local shards that do not match what the master expects
    removeShards(state);
    //may also fail shards, but those are guaranteed to be in failedShardsCache
    updateIndices(event);
    //create index services
    createIndices(state);
    //create or update shards
    createOrUpdateShards(state);
}

Removing failed-shard cache entries no longer assigned to this node by the master

  • If the master has changed and the routing table is empty, all entries are cleared
  • Entries for shards no longer allocated to this node are removed
  • Shards that failed locally are re-sent to the master so it can handle them (a small sketch of the allocation-id check follows the method below)
//remove entries from the failed shards cache for shards that the master no longer assigns to this node
updateFailedShardsCache(state);

private void updateFailedShardsCache(final ClusterState state) {
    RoutingNode localRoutingNode = state.getRoutingNodes().node(state.nodes().getLocalNodeId());
    //master发生变更,路由表是空的,则删除所有数据
    if (localRoutingNode == null) {
        failedShardsCache.clear();
        return;
    }

    DiscoveryNode masterNode = state.nodes().getMasterNode();

    // remove items from cache which are not in our routing table anymore and resend failures that have not executed on master yet
    //移除不在本节点分配的信息
    for (Iterator<Map.Entry<ShardId, ShardRouting>> iterator = failedShardsCache.entrySet().iterator(); iterator.hasNext(); ) {
        ShardRouting failedShardRouting = iterator.next().getValue();
        ShardRouting matchedRouting = localRoutingNode.getByShardId(failedShardRouting.shardId());
        if (matchedRouting == null || matchedRouting.isSameAllocation(failedShardRouting) == false) {
            iterator.remove();
        } else {
            if (masterNode != null) { // TODO: can we remove this? Is resending shard failures the responsibility of shardStateAction?
                String message = "master " + masterNode + " has not removed previously failed shard. resending shard failure";
                logger.trace("[{}] re-sending failed shard [{}], reason [{}]", matchedRouting.shardId(), matchedRouting, message);
                //将失败的shard发送给master,由master进行处理
                shardStateAction.localShardFailed(matchedRouting, message, null, SHARD_STATE_ACTION_LISTENER, state);
            }
        }
    }
}
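
The key check above is isSameAllocation: an entry for the same ShardId but a different allocation id refers to a different (stale or re-created) copy of the shard. A minimal self-contained sketch of that comparison, using a stand-in class rather than the real ShardRouting:

//simplified stand-in for ShardRouting, only used for this illustration
class FakeRouting {
    final String index;
    final int shardId;
    final String allocationId;

    FakeRouting(String index, int shardId, String allocationId) {
        this.index = index;
        this.shardId = shardId;
        this.allocationId = allocationId;
    }

    //mirrors ShardRouting#isSameAllocation: same shard *and* same allocation id
    boolean isSameAllocation(FakeRouting other) {
        return index.equals(other.index) && shardId == other.shardId && allocationId.equals(other.allocationId);
    }
}

class AllocationCheckDemo {
    public static void main(String[] args) {
        FakeRouting cachedFailure = new FakeRouting("logs", 0, "alloc-A");
        FakeRouting currentRouting = new FakeRouting("logs", 0, "alloc-B");
        //same shard id but a different allocation id: the cached failure belongs to an older copy,
        //so updateFailedShardsCache would drop it from the cache
        System.out.println(cachedFailure.isSameAllocation(currentRouting)); // prints false
    }
}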

Handling deleted indices

//clean up indices that have been deleted
deleteIndices(event);

private void deleteIndices(final ClusterChangedEvent event) {
    final ClusterState previousState = event.previousState();
    final ClusterState state = event.state();
    final String localNodeId = state.nodes().getLocalNodeId();
    assert localNodeId != null;
    //遍历待删除索引
    for (Index index : event.indicesDeleted()) {
        if (logger.isDebugEnabled()) {
            logger.debug("[{}] cleaning index, no longer part of the metadata", index);
        }
        //每个索引单独的服务
        AllocatedIndex<? extends Shard> indexService = indicesService.indexService(index);
        final IndexSettings indexSettings;
        if (indexService != null) {
            indexSettings = indexService.getIndexSettings();
            //删除索引
            indicesService.removeIndex(index, DELETED, "index no longer part of the metadata");
        } else if (previousState.metadata().hasIndex(index.getName())) {
            // The deleted index was part of the previous cluster state, but not loaded on the local node
            //previousState包含待删除的索引需要删除
            final IndexMetadata metadata = previousState.metadata().index(index);
            indexSettings = new IndexSettings(metadata, settings);
            indicesService.deleteUnassignedIndex("deleted index was not assigned to local node", metadata, state);
        } else {
            // The previous cluster state's metadata also does not contain the index,
            // which is what happens on node startup when an index was deleted while the
            // node was not part of the cluster.  In this case, try reading the index
            // metadata from disk.  If its not there, there is nothing to delete.
            // First, though, verify the precondition for applying this case by
            // asserting that the previous cluster state is not initialized/recovered.
            assert previousState.blocks().hasGlobalBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK);
            //验证索引是否删除
            final IndexMetadata metadata = indicesService.verifyIndexIsDeleted(index, event.state());
            if (metadata != null) {
                indexSettings = new IndexSettings(metadata, settings);
            } else {
                indexSettings = null;
            }
        }
        if (indexSettings != null) {
            threadPool.generic().execute(new AbstractRunnable() {
                @Override
                public void onFailure(Exception e) {
                    logger.warn(() -> new ParameterizedMessage("[{}] failed to complete pending deletion for index", index), e);
                }

                @Override
                protected void doRun() throws Exception {
                    try {
                        // we are waiting until we can lock the index / all shards on the node and then we ack the delete of the store
                        // to the master. If we can't acquire the locks here immediately there might be a shard of this index still
                        // holding on to the lock due to a "currently canceled recovery" or so. The shard will delete itself BEFORE the
                        // lock is released so it's guaranteed to be deleted by the time we get the lock
                        //执行索引删除
                        indicesService.processPendingDeletes(index, indexSettings, new TimeValue(30, TimeUnit.MINUTES));
                    } catch (ShardLockObtainFailedException exc) {
                        logger.warn("[{}] failed to lock all shards for index - timed out after 30 seconds", index);
                    } catch (InterruptedException e) {
                        logger.warn("[{}] failed to lock all shards for index - interrupted", index);
                    }
                }
            });
        }
    }
}
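
processPendingDeletes waits, here for up to 30 minutes, until all shard locks of the index can be acquired before wiping the data. A rough self-contained sketch of that wait-then-delete pattern, using plain ReentrantLocks instead of the real ShardLock machinery:

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantLock;

class PendingDeleteSketch {
    //one lock per shard directory; in ES this role is played by NodeEnvironment's per-shard ShardLock
    static void deleteWhenUnlocked(List<ReentrantLock> shardLocks, Runnable deleteFiles) throws InterruptedException {
        List<ReentrantLock> acquired = new ArrayList<>();
        try {
            for (ReentrantLock lock : shardLocks) {
                //a recovery that is being cancelled may still hold the lock, so wait instead of failing immediately
                if (!lock.tryLock(30, TimeUnit.MINUTES)) {
                    throw new IllegalStateException("timed out waiting for shard lock");
                }
                acquired.add(lock);
            }
            deleteFiles.run(); //only now is it safe to remove the on-disk data
        } finally {
            acquired.forEach(ReentrantLock::unlock);
        }
    }
}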

For indices that were closed, the index is removed to release resources; the data on disk is not deleted

//mainly handles releasing resources when an index is closed
removeIndices(event);

Notify the master about shards that should be active on this node but do not exist here

failMissingShards(state);

Remove shards that are not allocated to this node, as well as stale shard copies

removeShards(state);

private void removeShards(final ClusterState state) {
    final String localNodeId = state.nodes().getLocalNodeId();
    assert localNodeId != null;

    // remove shards based on routing nodes (no deletion of data)
    //节点和分片映射关系
    RoutingNode localRoutingNode = state.getRoutingNodes().node(localNodeId);
    for (AllocatedIndex<? extends Shard> indexService : indicesService) {
        for (Shard shard : indexService) {
            ShardRouting currentRoutingEntry = shard.routingEntry();
            ShardId shardId = currentRoutingEntry.shardId();
            ShardRouting newShardRouting = localRoutingNode == null ? null : localRoutingNode.getByShardId(shardId);
            if (newShardRouting == null) {
                // we can just remove the shard without cleaning it locally, since we will clean it in IndicesStore
                // once all shards are allocated
                //删除没有分配给当前节点的shard
                logger.debug("{} removing shard (not allocated)", shardId);
                indexService.removeShard(shardId.id(), "removing shard (not allocated)");
            } else if (newShardRouting.isSameAllocation(currentRoutingEntry) == false) {
                logger.debug("{} removing shard (stale allocation id, stale {}, new {})", shardId,
                             currentRoutingEntry, newShardRouting);
                //删除过时的shard
                indexService.removeShard(shardId.id(), "removing shard (stale copy)");
            } else if (newShardRouting.initializing() && currentRoutingEntry.active()) {
                // this can happen if the node was isolated/gc-ed, rejoins the cluster and a new shard with the same allocation id
                // is assigned to it. Batch cluster state processing or if shard fetching completes before the node gets a new cluster
                // state may result in a new shard being initialized while having the same allocation id as the currently started shard.
                logger.debug("{} removing shard (not active, current {}, new {})", shardId, currentRoutingEntry, newShardRouting);
                //当节点离线后重新加入集群,可能导致分片的allocation id相同,则需要移除旧的shard
                indexService.removeShard(shardId.id(), "removing shard (stale copy)");
            } else if (newShardRouting.primary() && currentRoutingEntry.primary() == false && newShardRouting.initializing()) {
                assert currentRoutingEntry.initializing() : currentRoutingEntry; // see above if clause
                // this can happen when cluster state batching batches activation of the shard, closing an index, reopening it
                // and assigning an initializing primary to this node
                //批量更新时,关闭索引,重新打开
                logger.debug("{} removing shard (not active, current {}, new {})", shardId, currentRoutingEntry, newShardRouting);
                indexService.removeShard(shardId.id(), "removing shard (stale copy)");
            }
        }
    }
}

Updating index mappings and settings

//update index settings, mappings, and other metadata
updateIndices(event);

Creating the index services

//create indices
createIndices(state);
 
  private void createIndices(final ClusterState state) {
        // we only create indices for shards that are allocated
        //我们只为分配的分片创建索引
        //为分配给我们的shard创建索引
        RoutingNode localRoutingNode = state.getRoutingNodes().node(state.nodes().getLocalNodeId());
        if (localRoutingNode == null) {
            return;
        }
        // create map of indices to create with shards to fail if index creation fails
        //如果索引创建失败,则创建索引映射以使用分片创建失败
        final Map<Index, List<ShardRouting>> indicesToCreate = new HashMap<>();
        for (ShardRouting shardRouting : localRoutingNode) {
            if (failedShardsCache.containsKey(shardRouting.shardId()) == false) {
                final Index index = shardRouting.index();
                if (indicesService.indexService(index) == null) {
                    indicesToCreate.computeIfAbsent(index, k -> new ArrayList<>()).add(shardRouting);
                }
            }
        }

        for (Map.Entry<Index, List<ShardRouting>> entry : indicesToCreate.entrySet()) {
            final Index index = entry.getKey();
            final IndexMetadata indexMetadata = state.metadata().index(index);
            logger.debug("[{}] creating index", index);

            AllocatedIndex<? extends Shard> indexService = null;
            try {
                //创建索引
                indexService = indicesService.createIndex(indexMetadata, buildInIndexListener, true);
                if (indexService.updateMapping(null, indexMetadata) && sendRefreshMapping) {
                    nodeMappingRefreshAction.nodeMappingRefresh(state.nodes().getMasterNode(),
                        new NodeMappingRefreshAction.NodeMappingRefreshRequest(indexMetadata.getIndex().getName(),
                            indexMetadata.getIndexUUID(), state.nodes().getLocalNodeId())
                    );
                }
            } catch (Exception e) {
                final String failShardReason;
                if (indexService == null) {
                    failShardReason = "failed to create index";
                } else {
                    failShardReason = "failed to update mapping for index";
                    indicesService.removeIndex(index, FAILURE, "removing index (mapping update failed)");
                }
                for (ShardRouting shardRouting : entry.getValue()) {
                    sendFailShard(shardRouting, failShardReason, e, state);
                }
            }
        }
    }

The createIndex method actually only creates an IndexService and puts it into a cache; it does not yet perform any real index creation on disk.

@Override
public synchronized IndexService createIndex(
    final IndexMetadata indexMetadata, final List<IndexEventListener> builtInListeners,
    final boolean writeDanglingIndices) throws IOException {
    ensureChangesAllowed();
    //检查请求是否有效
    if (indexMetadata.getIndexUUID().equals(IndexMetadata.INDEX_UUID_NA_VALUE)) {
        throw new IllegalArgumentException("index must have a real UUID found value: [" + indexMetadata.getIndexUUID() + "]");
    }
    final Index index = indexMetadata.getIndex();
    //检查索引是否已经存在
    if (hasIndex(index)) {
        throw new ResourceAlreadyExistsException(index);
    }
    List<IndexEventListener> finalListeners = new ArrayList<>(builtInListeners);
    final IndexEventListener onStoreClose = new IndexEventListener() {
        @Override
        public void onStoreCreated(ShardId shardId) {
            indicesRefCount.incRef();
        }
        @Override
        public void onStoreClosed(ShardId shardId) {
            try {
                indicesQueryCache.onClose(shardId);
            } finally {
                indicesRefCount.decRef();
            }
        }
    };
    finalListeners.add(onStoreClose);
    finalListeners.add(oldShardsStats);
    //创建索引服务
    final IndexService indexService =
        createIndexService(
        CREATE_INDEX,
        indexMetadata,
        indicesQueryCache,
        indicesFieldDataCache,
        finalListeners,
        indexingMemoryController);
    boolean success = false;
    try {
        if (writeDanglingIndices && nodeWriteDanglingIndicesInfo) {
            indexService.addMetadataListener(imd -> updateDanglingIndicesInfo(index));
        }
        indexService.getIndexEventListener().afterIndexCreated(indexService);
        //存储索引服务
        indices = newMapBuilder(indices).put(index.getUUID(), indexService).immutableMap();
        if (writeDanglingIndices) {
            if (nodeWriteDanglingIndicesInfo) {
                updateDanglingIndicesInfo(index);
            } else {
                indexService.deleteDanglingIndicesInfo();
            }
        }
        success = true;
        return indexService;
    } finally {
        if (success == false) {
            indexService.close("plugins_failed", true);
        }
    }
}
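
Note the line indices = newMapBuilder(indices).put(...).immutableMap(): the registry of index services is never mutated in place, but republished as a new immutable map so that readers need no locking. A minimal sketch of this copy-on-write pattern with plain JDK types:

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

class CopyOnWriteRegistry<K, V> {
    //readers always see a consistent snapshot through this volatile reference
    private volatile Map<K, V> registry = Collections.emptyMap();

    //writers are serialized externally (createIndex itself is synchronized in IndicesService)
    synchronized void put(K key, V value) {
        Map<K, V> copy = new HashMap<>(registry);
        copy.put(key, value);
        registry = Collections.unmodifiableMap(copy); //publish a new immutable snapshot
    }

    V get(K key) {
        return registry.get(key); //lock-free read, as used later by indexService(index)
    }
}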

Performing the actual shard creation or update

//create or update shards
createOrUpdateShards(state);

private void createOrUpdateShards(final ClusterState state) {
    RoutingNode localRoutingNode = state.getRoutingNodes().node(state.nodes().getLocalNodeId());
    if (localRoutingNode == null) {
        return;
    }

    DiscoveryNodes nodes = state.nodes();
    //索引分片访问路由表
    RoutingTable routingTable = state.routingTable();

    for (final ShardRouting shardRouting : localRoutingNode) {
        ShardId shardId = shardRouting.shardId();
        if (failedShardsCache.containsKey(shardId) == false) {
            AllocatedIndex<? extends Shard> indexService = indicesService.indexService(shardId.getIndex());
            assert indexService != null : "index " + shardId.getIndex() + " should have been created by createIndices";
            Shard shard = indexService.getShardOrNull(shardId.id());
            if (shard == null) {
                assert shardRouting.initializing() : shardRouting + " should have been removed by failMissingShards";
                //创建分片
                createShard(nodes, routingTable, shardRouting, state);
            } else {
                //更新分片
                updateShard(nodes, shardRouting, shard, routingTable, state);
            }
        }
    }
}

If a shard assigned to this node does not already exist here, it has to be created

private void createShard(DiscoveryNodes nodes, RoutingTable routingTable, ShardRouting shardRouting, ClusterState state) {
    assert shardRouting.initializing() : "only allow shard creation for initializing shard but was " + shardRouting;

    DiscoveryNode sourceNode = null;
    //需要从远程恢复
    if (shardRouting.recoverySource().getType() == Type.PEER)  {
        //查到主分片所在的节点
        sourceNode = findSourceNodeForPeerRecovery(logger, routingTable, nodes, shardRouting);
        if (sourceNode == null) {
            logger.trace("ignoring initializing shard {} - no source node can be found.", shardRouting.shardId());
            return;
        }
    }

    try {
        final long primaryTerm = state.metadata().index(shardRouting.index()).primaryTerm(shardRouting.id());
        logger.debug("{} creating shard with primary term [{}]", shardRouting.shardId(), primaryTerm);
        //跟踪与分片恢复相关的状态。
        RecoveryState recoveryState = new RecoveryState(shardRouting, nodes.getLocalNode(), sourceNode);
        //创建分片
        indicesService.createShard(
            shardRouting,
            recoveryState,
            recoveryTargetService,
            new RecoveryListener(shardRouting, primaryTerm),
            repositoriesService,
            failedShardHandler,
            globalCheckpointSyncer,
            retentionLeaseSyncer);
    } catch (Exception e) {
        failAndRemoveShard(shardRouting, true, "failed to create shard", e, state);
    }
}
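
For a PEER recovery the target first has to locate the node holding an active primary (findSourceNodeForPeerRecovery). A self-contained sketch of that lookup with simplified routing entries; the real method additionally handles relocating primaries:

import java.util.List;

//simplified routing entry, only for illustration; not the real ShardRouting
class SimpleRouting {
    final boolean primary;
    final boolean active;
    final String nodeId;

    SimpleRouting(boolean primary, boolean active, String nodeId) {
        this.primary = primary;
        this.active = active;
        this.nodeId = nodeId;
    }
}

class SourceNodeLookup {
    //returns the node id of the active primary, or null if there is none yet
    static String findSourceNode(List<SimpleRouting> shardCopies) {
        for (SimpleRouting copy : shardCopies) {
            if (copy.primary && copy.active) {
                return copy.nodeId;
            }
        }
        return null; //the recovery is simply retried once a later cluster state has an active primary
    }
}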

The IndexService cached earlier is looked up by index UUID and used to create the shard

@Override
public IndexShard createShard(
    final ShardRouting shardRouting,
    final RecoveryState recoveryState,
    final PeerRecoveryTargetService recoveryTargetService,
    final PeerRecoveryTargetService.RecoveryListener recoveryListener,
    final RepositoriesService repositoriesService,
    final Consumer<IndexShard.ShardFailure> onShardFailure,
    final Consumer<ShardId> globalCheckpointSyncer,
    final RetentionLeaseSyncer retentionLeaseSyncer) throws IOException {
    Objects.requireNonNull(retentionLeaseSyncer);
    ensureChangesAllowed();
    //获取之前创建的indexService
    IndexService indexService = indexService(shardRouting.index());
    //创建shard
    IndexShard indexShard = indexService.createShard(shardRouting, globalCheckpointSyncer, retentionLeaseSyncer);
    indexShard.addShardFailureCallback(onShardFailure);
    //开始恢复
    indexShard.startRecovery(recoveryState, recoveryTargetService, recoveryListener, repositoriesService,
                             (type, mapping) -> {
                                 assert recoveryState.getRecoverySource().getType() == RecoverySource.Type.LOCAL_SHARDS:
                                 "mapping update consumer only required by local shards recovery";
                                 client.admin().indices().preparePutMapping()
                                     .setConcreteIndex(shardRouting.index()) // concrete index - no name clash, it uses uuid
                                     .setType(type)
                                     .setSource(mapping.source().string(), XContentType.JSON)
                                     .get();
                             }, this);
    return indexShard;
}


public synchronized IndexShard createShard(
            final ShardRouting routing,
            final Consumer<ShardId> globalCheckpointSyncer,
            final RetentionLeaseSyncer retentionLeaseSyncer) throws IOException {
        Objects.requireNonNull(retentionLeaseSyncer);
        /*
         * TODO: we execute this in parallel but it's a synced method. Yet, we might
         * be able to serialize the execution via the cluster state in the future. for now we just
         * keep it synced.
         */
        if (closed.get()) {
            throw new IllegalStateException("Can't create shard " + routing.shardId() + ", closed");
        }
        final Settings indexSettings = this.indexSettings.getSettings();
        //分片id
        final ShardId shardId = routing.shardId();
        boolean success = false;
        Store store = null;
        IndexShard indexShard = null;
        ShardLock lock = null;
        try {
            //shard创建锁
            lock = nodeEnv.shardLock(shardId, "shard creation", TimeUnit.SECONDS.toMillis(5));
            eventListener.beforeIndexShardCreated(shardId, indexSettings);
            ShardPath path;
            try {
                //创建shard路径
                path = ShardPath.loadShardPath(logger, nodeEnv, shardId, this.indexSettings.customDataPath());
            } catch (IllegalStateException ex) {
                logger.warn("{} failed to load shard path, trying to remove leftover", shardId);
                try {
                    ShardPath.deleteLeftoverShardDirectory(logger, nodeEnv, lock, this.indexSettings);
                    path = ShardPath.loadShardPath(logger, nodeEnv, shardId, this.indexSettings.customDataPath());
                } catch (Exception inner) {
                    ex.addSuppressed(inner);
                    throw ex;
                }
            }

            if (path == null) {
                // TODO: we should, instead, hold a "bytes reserved" of how large we anticipate this shard will be, e.g. for a shard
                // that's being relocated/replicated we know how large it will become once it's done copying:
                // Count up how many shards are currently on each data path:
                Map<Path, Integer> dataPathToShardCount = new HashMap<>();
                for (IndexShard shard : this) {
                    Path dataPath = shard.shardPath().getRootStatePath();
                    Integer curCount = dataPathToShardCount.get(dataPath);
                    if (curCount == null) {
                        curCount = 0;
                    }
                    dataPathToShardCount.put(dataPath, curCount + 1);
                }
                path = ShardPath.selectNewPathForShard(nodeEnv, shardId, this.indexSettings,
                    routing.getExpectedShardSize() == ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE
                        ? getAvgShardSizeInBytes() : routing.getExpectedShardSize(),
                    dataPathToShardCount);
                logger.debug("{} creating using a new path [{}]", shardId, path);
            } else {
                logger.debug("{} creating using an existing path [{}]", shardId, path);
            }

            if (shards.containsKey(shardId.id())) {
                throw new IllegalStateException(shardId + " already exists");
            }

            logger.debug("creating shard_id {}", shardId);
            // if we are on a shared FS we only own the shard (ie. we can safely delete it) if we are the primary.
            //如果我们在共享 FS 上,我们只拥有该分片(即,我们可以安全地删除它)如果我们是主分片
            final Engine.Warmer engineWarmer = (reader) -> {
                IndexShard shard =  getShardOrNull(shardId.getId());
                if (shard != null) {
                    warmer.warm(reader, shard, IndexService.this.indexSettings);
                }
            };
            //lucene存储
            Directory directory = directoryFactory.newDirectory(this.indexSettings, path);
            //es封装存储类
            store = new Store(shardId, this.indexSettings, directory, lock,
                    new StoreCloseListener(shardId, () -> eventListener.onStoreClosed(shardId)));
            eventListener.onStoreCreated(shardId);
            indexShard = new IndexShard(
                    routing,
                    this.indexSettings,
                    path,
                    store,
                    indexSortSupplier,
                    indexCache,
                    mapperService,
                    similarityService,
                    engineFactory,
                    eventListener,
                    readerWrapper,
                    threadPool,
                    bigArrays,
                    engineWarmer,
                    searchOperationListeners,
                    indexingOperationListeners,
                    () -> globalCheckpointSyncer.accept(shardId),
                    retentionLeaseSyncer,
                    circuitBreakerService);
            //发布事件
            eventListener.indexShardStateChanged(indexShard, null, indexShard.state(), "shard created");
            eventListener.afterIndexShardCreated(indexShard);
            shards = newMapBuilder(shards).put(shardId.id(), indexShard).immutableMap();
            success = true;
            return indexShard;
        } catch (ShardLockObtainFailedException e) {
            throw new IOException("failed to obtain in-memory shard lock", e);
        } finally {
            if (success == false) {
                if (lock != null) {
                    IOUtils.closeWhileHandlingException(lock);
                }
                closeShard("initialization failed", shardId, indexShard, store, eventListener);
            }
        }
    }

Once the shard has been created, the recovery process begins. Shard recovery is split into primary shard recovery and replica shard recovery; both exist to guarantee data integrity.

  • Primary shard recovery can replay the transaction log (translog) to restore data
case EMPTY_STORE:
case EXISTING_STORE: //the primary recovers from the local store and replays the translog
    executeRecovery("from store", recoveryState, recoveryListener, this::recoverFromStore);
    break;

The initial recovery stage is INIT. The primary shard is first marked as RECOVERING, and the recovery task is then run on the generic thread pool; a simplified sketch of the stage progression follows the method below.

private void executeRecovery(String reason, RecoveryState recoveryState, PeerRecoveryTargetService.RecoveryListener recoveryListener,
                                 CheckedConsumer<ActionListener<Boolean>, Exception> action) {
        //设置主分片的状态为RECOVERING
        markAsRecovering(reason, recoveryState); // mark the shard as recovering on the cluster state thread
        threadPool.generic().execute(ActionRunnable.wrap(ActionListener.wrap(r -> {
                if (r) {
                    //恢复成功向master发送action为internal:cluster/shard/started请求
                    recoveryListener.onRecoveryDone(recoveryState);
                }
            },
            //恢复失败,关闭引擎向master发送action为internal:cluster/shard/failure请求
            e -> recoveryListener.onRecoveryFailure(recoveryState, new RecoveryFailedException(recoveryState, null, e), true)), action));
    }
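
Store-based recovery then moves through the stages of RecoveryState in a fixed order. The enum below is a simplified sketch of that progression, using the stage names referenced throughout this article; it is not the real RecoveryState.Stage, which also records per-stage statistics and validates transitions in more detail:

//simplified model of the recovery stages discussed below
enum Stage { INIT, INDEX, VERIFY_INDEX, TRANSLOG, FINALIZE, DONE }

class StageTracker {
    private Stage stage = Stage.INIT;

    //only the next stage in order is accepted, loosely mirroring RecoveryState#setStage
    void moveTo(Stage next) {
        if (next.ordinal() != stage.ordinal() + 1) {
            throw new IllegalStateException("can't move from " + stage + " to " + next);
        }
        stage = next;
    }
}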

Executing the recovery

//primary shard recovery
public void recoverFromStore(ActionListener<Boolean> listener) {
    // we are the first primary, recover from the gateway
    // if its post api allocation, the index should exists
    assert shardRouting.primary() : "recover from store only makes sense if the shard is a primary shard";
    assert shardRouting.initializing() : "can only start recovery on initializing shard";
    StoreRecovery storeRecovery = new StoreRecovery(shardId, logger);
    storeRecovery.recoverFromStore(this, listener);
}

void recoverFromStore(final IndexShard indexShard, ActionListener<Boolean> listener) {
    if (canRecover(indexShard)) {//验证分片状态
        RecoverySource.Type recoveryType = indexShard.recoveryState().getRecoverySource().getType();
        assert recoveryType == RecoverySource.Type.EMPTY_STORE || recoveryType == RecoverySource.Type.EXISTING_STORE :
        "expected store recovery type but was: " + recoveryType;
        ActionListener.completeWith(recoveryListener(indexShard, listener), () -> {
            logger.debug("starting recovery from store ...");
            //从存储中恢复分片的状态。
            internalRecoverFromStore(indexShard);
            return true;
        });
    } else {
        listener.onResponse(false);
    }
}

Next, the internalRecoverFromStore method is called to perform the recovery

private void internalRecoverFromStore(IndexShard indexShard) throws IndexShardRecoveryException {
    //恢复前回调
    indexShard.preRecovery();
    final RecoveryState recoveryState = indexShard.recoveryState();
    final boolean indexShouldExists = recoveryState.getRecoverySource().getType() != RecoverySource.Type.EMPTY_STORE;
    //index阶段
    indexShard.prepareForIndexRecovery();
    SegmentInfos si = null;
    //存储
    final Store store = indexShard.store();
    store.incRef();
    try {
        try {
            store.failIfCorrupted();
            try {
                //读取lucene最后一次提交的分段信息
                si = store.readLastCommittedSegmentsInfo();
            } catch (Exception e) {
                String files = "_unknown_";
                try {
                    files = Arrays.toString(store.directory().listAll());
                } catch (Exception inner) {
                    inner.addSuppressed(e);
                    files += " (failure=" + ExceptionsHelper.detailedMessage(inner) + ")";
                }
                if (indexShouldExists) {
                    throw new IndexShardRecoveryException(shardId,
                                                          "shard allocated for local recovery (post api), should exist, but doesn't, current files: " + files, e);
                }
            }
            if (si != null && indexShouldExists == false) {
                // it exists on the directory, but shouldn't exist on the FS, its a leftover (possibly dangling)
                // its a "new index create" API, we have to do something, so better to clean it than use same data
                logger.trace("cleaning existing shard, shouldn't exists");
                Lucene.cleanLuceneIndex(store.directory());
                si = null;
            }
        } catch (Exception e) {
            throw new IndexShardRecoveryException(shardId, "failed to fetch index version after copying it over", e);
        }
        //shrink 建立硬链接
        if (recoveryState.getRecoverySource().getType() == RecoverySource.Type.LOCAL_SHARDS) {
            assert indexShouldExists;
            bootstrap(indexShard, store);
            writeEmptyRetentionLeasesFile(indexShard);
        } else if (indexShouldExists) {
            if (recoveryState.getRecoverySource().shouldBootstrapNewHistoryUUID()) {
                store.bootstrapNewHistory();
                writeEmptyRetentionLeasesFile(indexShard);
            }
            // since we recover from local, just fill the files and size
            //由于我们从本地恢复,只需填写文件和大小
            try {
                final RecoveryState.Index index = recoveryState.getIndex();
                if (si != null) {
                    //根据最后一次分段信息,获取文件信息
                    addRecoveredFileDetails(si, store, index);
                }
            } catch (IOException e) {
                logger.debug("failed to list file details", e);
            }
        } else {
            //创建空的translog和lucene分段
            store.createEmpty(indexShard.indexSettings().getIndexVersionCreated().luceneVersion);
            final String translogUUID = Translog.createEmptyTranslog(
                indexShard.shardPath().resolveTranslog(), SequenceNumbers.NO_OPS_PERFORMED, shardId,
                indexShard.getPendingPrimaryTerm());
            //lucene存储中关联translogUUID
            store.associateIndexWithNewTranslog(translogUUID);
            writeEmptyRetentionLeasesFile(indexShard);
        }
        //从translog开始恢复
        indexShard.openEngineAndRecoverFromTranslog();
        //填充本地检查点和seqNo相同
        indexShard.getEngine().fillSeqNoGaps(indexShard.getPendingPrimaryTerm());
        //进入finalize阶段
        indexShard.finalizeRecovery();
        //进入最后一个done阶段
        indexShard.postRecovery("post recovery from shard_store");
    } catch (EngineException | IOException e) {
        throw new IndexShardRecoveryException(shardId, "failed to recover from gateway", e);
    } finally {
        store.decRef();
    }
}

It first performs a simple check of the recovery state and runs the pre-recovery callback, then marks the shard as entering the INDEX stage and reads the segment info of the last Lucene commit, obtaining its version.
Recovery then continues from the translog:

//start recovering from the translog
indexShard.openEngineAndRecoverFromTranslog();
//verify whether the Lucene index is corrupted; this can take a long time and is skipped by default
maybeCheckIndex();

Entering the VERIFY_INDEX stage
The main job of this stage is to verify whether the shard is corrupted, by comparing the checksums recorded in the store metadata against the actual Lucene files. For indices with a lot of data this check takes a long time, so it is disabled by default; it can be enabled with index.shard.check_on_startup=true.
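
A hedged example of enabling this verification when creating an index, assuming the Settings builder and CreateIndexRequest from the Java client API of this era; as far as I know "checksum" restricts the check to file checksums, while "true" additionally runs the slower logical Lucene check:

import org.elasticsearch.action.admin.indices.create.CreateIndexRequest;
import org.elasticsearch.common.settings.Settings;

class CheckOnStartupExample {
    //assumed usage: pass the request to client.admin().indices().create(request, listener)
    static CreateIndexRequest newIndexWithStartupCheck(String indexName) {
        return new CreateIndexRequest(indexName)
            .settings(Settings.builder()
                .put("index.number_of_shards", 1)
                //verify shards on startup; slow for large shards, so off by default
                .put("index.shard.check_on_startup", "checksum"));
    }
}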

public void openEngineAndRecoverFromTranslog() throws IOException {
    assert recoveryState.getStage() == RecoveryState.Stage.INDEX : "unexpected recovery stage [" + recoveryState.getStage() + "]";
    //校验lucene index是否损坏,这个时间比较长,默认跳过
    maybeCheckIndex();
    //进入translog阶段
    recoveryState.setStage(RecoveryState.Stage.TRANSLOG);
    final RecoveryState.Translog translogRecoveryStats = recoveryState.getTranslog();
    final Engine.TranslogRecoveryRunner translogRecoveryRunner = (engine, snapshot) -> {
        //记录操作位置
        translogRecoveryStats.totalOperations(snapshot.totalOperations());
        translogRecoveryStats.totalOperationsOnStart(snapshot.totalOperations());
        return runTranslogRecovery(engine, snapshot, Engine.Operation.Origin.LOCAL_TRANSLOG_RECOVERY,
                                   translogRecoveryStats::incrementRecoveredOperations);
    };
    //加载全局检查点信息查看哪些数据已经刷盘
    loadGlobalCheckpointToReplicationTracker();
    innerOpenEngineAndTranslog(replicationTracker);
    //开始从translog恢复
    getEngine().recoverFromTranslog(translogRecoveryRunner, Long.MAX_VALUE);
}

Entering the TRANSLOG stage
A Lucene index is made up of many segments. Writes first land in the operating system cache and are flushed to disk periodically in the background. To avoid losing data, ES uses the translog to guarantee durability: while an operation is written to memory, a copy is also written to the translog. A checkpoint is maintained internally that records which segments the Lucene index contains and which of them have already been fsynced to disk. In this stage, the operations in the transaction log that have not yet been flushed must be replayed: a snapshot is taken based on the last commit to determine which translog entries need replaying, and the replayed data is then written to Lucene and flushed to disk.
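
Put differently, only operations whose sequence number lies above the local checkpoint of the last commit need to be replayed. A self-contained sketch of that selection, using a plain list in place of a real translog snapshot:

import java.util.List;
import java.util.function.Consumer;

//a translog operation reduced to its sequence number, only for illustration
class Op {
    final long seqNo;
    final String payload;

    Op(long seqNo, String payload) {
        this.seqNo = seqNo;
        this.payload = payload;
    }
}

class TranslogReplaySketch {
    //replay every operation above the local checkpoint, up to (and including) recoverUpToSeqNo
    static int replay(List<Op> translog, long localCheckpoint, long recoverUpToSeqNo, Consumer<Op> applyToLucene) {
        int opsRecovered = 0;
        for (Op op : translog) {
            if (op.seqNo > localCheckpoint && op.seqNo <= recoverUpToSeqNo) {
                applyToLucene.accept(op); //corresponds to applyTranslogOperation(...)
                opsRecovered++;
            }
        }
        return opsRecovered; //recoverFromTranslogInternal commits and refreshes afterwards if this is > 0
    }
}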

private void recoverFromTranslogInternal(TranslogRecoveryRunner translogRecoveryRunner, long recoverUpToSeqNo) throws IOException {
    final int opsRecovered;
    //获取本地检查点位置
    final long localCheckpoint = getProcessedLocalCheckpoint();
    //需要恢复的操作序列号比已刷盘的位置大
    if (localCheckpoint < recoverUpToSeqNo) {
        //读取translog
        try (Translog.Snapshot snapshot = translog.newSnapshot(localCheckpoint + 1, recoverUpToSeqNo)) {
            opsRecovered = translogRecoveryRunner.run(this, snapshot);
        } catch (Exception e) {
            throw new EngineException(shardId, "failed to recover from translog", e);
        }
    } else {
        opsRecovered = 0;
    }
    // flush if we recovered something or if we have references to older translogs
    // note: if opsRecovered == 0 and we have older translogs it means they are corrupted or 0 length.
    assert pendingTranslogRecovery.get() : "translogRecovery is not pending but should be";
    pendingTranslogRecovery.set(false); // we are good - now we can commit
    if (opsRecovered > 0) {
        logger.trace("flushing post recovery from translog: ops recovered [{}], current translog generation [{}]",
                     opsRecovered, translog.currentFileGeneration());
        //提交
        commitIndexWriter(indexWriter, translog, null);
        //刷新数据生成新的lucene分段
        refreshLastCommittedSegmentInfos();
        refresh("translog_recovery");
    }
    translog.trimUnreferencedReaders();
}

The local checkpoint, i.e. the position already persisted to disk, is read and compared with the sequence number up to which recovery is required; if there is data to recover, a translog snapshot is created and the operations are replayed.

final Engine.TranslogRecoveryRunner translogRecoveryRunner = (engine, snapshot) -> {
    //record the number of operations in the snapshot
    translogRecoveryStats.totalOperations(snapshot.totalOperations());
    translogRecoveryStats.totalOperationsOnStart(snapshot.totalOperations());
    return runTranslogRecovery(engine, snapshot, Engine.Operation.Origin.LOCAL_TRANSLOG_RECOVERY,
                               translogRecoveryStats::incrementRecoveredOperations);
};

All translog operations that need to be replayed are iterated over and written into Lucene

int runTranslogRecovery(Engine engine, Translog.Snapshot snapshot, Engine.Operation.Origin origin,
                            Runnable onOperationRecovered) throws IOException {
    int opsRecovered = 0;
    Translog.Operation operation;
    while ((operation = snapshot.next()) != null) {
        try {
            logger.trace("[translog] recover op {}", operation);
            //重放translog数据开始恢复
            Engine.Result result = applyTranslogOperation(engine, operation, origin);
            switch (result.getResultType()) {
                case FAILURE:
                    throw result.getFailure();
                case MAPPING_UPDATE_REQUIRED:
                    throw new IllegalArgumentException("unexpected mapping update: " + result.getRequiredMappingUpdate());
                case SUCCESS:
                    break;
                default:
                    throw new AssertionError("Unknown result type [" + result.getResultType() + "]");
            }

            opsRecovered++;
            onOperationRecovered.run();
        } catch (Exception e) {
            // TODO: Don't enable this leniency unless users explicitly opt-in
            if (origin == Engine.Operation.Origin.LOCAL_TRANSLOG_RECOVERY && ExceptionsHelper.status(e) == RestStatus.BAD_REQUEST) {
                // mainly for MapperParsingException and Failure to detect xcontent
                logger.info("ignoring recovery of a corrupt translog entry", e);
            } else {
                throw ExceptionsHelper.convertToRuntime(e);
            }
        }
    }
    return opsRecovered;
}

The data is then committed, the in-memory record of the last committed segment info is updated, and a refresh moves the data into the operating system cache

if (opsRecovered > 0) {
    logger.trace("flushing post recovery from translog: ops recovered [{}], current translog generation [{}]",
                 opsRecovered, translog.currentFileGeneration());
    //commit
    commitIndexWriter(indexWriter, translog, null);
    //refresh so the recovered data becomes visible in new Lucene segments
    refreshLastCommittedSegmentInfos();
    refresh("translog_recovery");
}

Entering the FINALIZE stage

public void finalizeRecovery() {
    recoveryState().setStage(RecoveryState.Stage.FINALIZE);
    Engine engine = getEngine();
    engine.refresh("recovery_finalization");
    engine.config().setEnableGcDeletes(true);
}

A refresh is executed to write buffered data to files; no fsync is performed, so the data remains in the operating system cache.
Entering the DONE stage

public void postRecovery(String reason) throws IndexShardStartedException, IndexShardRelocatedException, IndexShardClosedException {
    synchronized (postRecoveryMutex) {
        // we need to refresh again to expose all operations that were index until now. Otherwise
        // we may not expose operations that were indexed with a refresh listener that was immediately
        // responded to in addRefreshListener. The refresh must happen under the same mutex used in addRefreshListener
        // and before moving this shard to POST_RECOVERY state (i.e., allow to read from this shard).
        //再次刷新
        getEngine().refresh("post_recovery");
        synchronized (mutex) {
            if (state == IndexShardState.CLOSED) {
                throw new IndexShardClosedException(shardId);
            }
            if (state == IndexShardState.STARTED) {
                throw new IndexShardStartedException(shardId);
            }
            //设置done阶段
            recoveryState.setStage(RecoveryState.Stage.DONE);
            //更新分片状态
            changeState(IndexShardState.POST_RECOVERY, reason);
        }
    }
}

A refresh is executed once more, the recovery state is moved to the DONE stage, and the shard state is updated; at this point primary shard recovery is complete.
Back in IndexShard's executeRecovery method, whether local recovery succeeded or failed, the master has to be notified.

  • Recovery succeeded
//on success, send a request with action internal:cluster/shard/started to the master
recoveryListener.onRecoveryDone(recoveryState);

On the master, the registered handler is ShardStartedTransportHandler

//handler for shard-started requests
transportService.registerRequestHandler(SHARD_STARTED_ACTION_NAME, ThreadPool.Names.SAME, StartedShardEntry::new,
    new ShardStartedTransportHandler(clusterService,
        new ShardStartedClusterStateTaskExecutor(allocationService, rerouteService, () -> followUpRerouteTaskPriority, logger),
        logger));

 @Override
public void messageReceived(StartedShardEntry request, TransportChannel channel, Task task) throws Exception {
    logger.debug("{} received shard started for [{}]", request.shardId, request);
    //集群提交分片启动任务
    clusterService.submitStateUpdateTask(
        "shard-started " + request,
        request,
        ClusterStateTaskConfig.build(Priority.URGENT),
        shardStartedClusterStateTaskExecutor,
        shardStartedClusterStateTaskExecutor);
    //返回空结果
    channel.sendResponse(TransportResponse.Empty.INSTANCE);
}

A cluster state update task is submitted and executed by ShardStartedClusterStateTaskExecutor; after it runs, a new cluster state is published to the cluster

public ClusterTasksResult<StartedShardEntry> execute(ClusterState currentState, List<StartedShardEntry> tasks) throws Exception {
    ClusterTasksResult.Builder<StartedShardEntry> builder = ClusterTasksResult.builder();
    List<StartedShardEntry> tasksToBeApplied = new ArrayList<>();
    List<ShardRouting> shardRoutingsToBeApplied = new ArrayList<>(tasks.size());
    Set<ShardRouting> seenShardRoutings = new HashSet<>(); // to prevent duplicates
    for (StartedShardEntry task : tasks) {
        //根据shard分配id获取路由信息
        final ShardRouting matched = currentState.getRoutingTable().getByAllocationId(task.shardId, task.allocationId);
        if (matched == null) {
            // tasks that correspond to non-existent shards are marked as successful. The reason is that we resend shard started
            // events on every cluster state publishing that does not contain the shard as started yet. This means that old stale
            // requests might still be in flight even after the shard has already been started or failed on the master. We just
            // ignore these requests for now.
            logger.debug("{} ignoring shard started task [{}] (shard does not exist anymore)", task.shardId, task);
            builder.success(task);
        } else {
            //主分片
            if (matched.primary() && task.primaryTerm > 0) {
                //索引元数据
                final IndexMetadata indexMetadata = currentState.metadata().index(task.shardId.getIndex());
                assert indexMetadata != null;
                final long currentPrimaryTerm = indexMetadata.primaryTerm(task.shardId.id());
                if (currentPrimaryTerm != task.primaryTerm) {
                    assert currentPrimaryTerm > task.primaryTerm : "received a primary term with a higher term than in the " +
                        "current cluster state (received [" + task.primaryTerm + "] but current is [" + currentPrimaryTerm + "])";
                    logger.debug("{} ignoring shard started task [{}] (primary term {} does not match current term {})",
                                 task.shardId, task, task.primaryTerm, currentPrimaryTerm);
                    builder.success(task);
                    continue;
                }
            }
            if (matched.initializing() == false) {
                assert matched.active() : "expected active shard routing for task " + task + " but found " + matched;
                // same as above, this might have been a stale in-flight request, so we just ignore.
                logger.debug("{} ignoring shard started task [{}] (shard exists but is not initializing: {})",
                             task.shardId, task, matched);
                builder.success(task);
            } else {
                // remove duplicate actions as allocation service expects a clean list without duplicates
                //删除重复操作,因为分配服务需要一个没有重复的干净列表
                //如果有就更新路由信息。没有就新增
                if (seenShardRoutings.contains(matched)) {
                    logger.trace("{} ignoring shard started task [{}] (already scheduled to start {})",
                                 task.shardId, task, matched);
                    tasksToBeApplied.add(task);
                } else {
                    logger.debug("{} starting shard {} (shard started task: [{}])", task.shardId, matched, task);
                    tasksToBeApplied.add(task);
                    shardRoutingsToBeApplied.add(matched);
                    seenShardRoutings.add(matched);
                }
            }
        }
    }
    assert tasksToBeApplied.size() >= shardRoutingsToBeApplied.size();

    ClusterState maybeUpdatedState = currentState;
    try {
        //构建分配分片后状态
        maybeUpdatedState = allocationService.applyStartedShards(currentState, shardRoutingsToBeApplied);
        builder.successes(tasksToBeApplied);
    } catch (Exception e) {
        logger.warn(() -> new ParameterizedMessage("failed to apply started shards {}", shardRoutingsToBeApplied), e);
        builder.failures(tasksToBeApplied, e);
    }

    return builder.build(maybeUpdatedState);
}
  • Recovery failed
//on failure, close the engine and send a request with action internal:cluster/shard/failure to the master
e -> recoveryListener.onRecoveryFailure(recoveryState, new RecoveryFailedException(recoveryState, null, e), true)), action));

The shard that failed recovery is removed and a message is sent to the master

private void failAndRemoveShard(ShardRouting shardRouting, boolean sendShardFailure, String message, @Nullable Exception failure,
                                    ClusterState state) {
    try {
        //删除分配失败的shard
        AllocatedIndex<? extends Shard> indexService = indicesService.indexService(shardRouting.shardId().getIndex());
        if (indexService != null) {
            Shard shard = indexService.getShardOrNull(shardRouting.shardId().id());
            if (shard != null && shard.routingEntry().isSameAllocation(shardRouting)) {
                indexService.removeShard(shardRouting.shardId().id(), message);
            }
        }
    } catch (ShardNotFoundException e) {
        // the node got closed on us, ignore it
    } catch (Exception inner) {
        inner.addSuppressed(failure);
        logger.warn(() -> new ParameterizedMessage(
            "[{}][{}] failed to remove shard after failure ([{}])",
            shardRouting.getIndexName(),
            shardRouting.getId(),
            message),
                    inner);
    }
    if (sendShardFailure) {
        //发送恢复失败的消息到master
        sendFailShard(shardRouting, message, failure, state);
    }
}

The handler for this is ShardFailedTransportHandler

transportService.registerRequestHandler(SHARD_FAILED_ACTION_NAME, ThreadPool.Names.SAME, FailedShardEntry::new,
    new ShardFailedTransportHandler(clusterService,
        new ShardFailedClusterStateTaskExecutor(allocationService, rerouteService, () -> followUpRerouteTaskPriority, logger),
        logger));

This likewise ends with a new cluster state being published.

  • Replica shard recovery

Replica shard recovery is more complex. New writes are still accepted while the replica is recovering, so the replica needs to pull both the primary's Lucene segments and the primary's translog. After the primary commits and refreshes data into the operating system cache, translog entries are removed once the data has been flushed to disk; if an entry was never sent to the replica and is also missing from the Lucene segments it received, the copies would become inconsistent. The translog must therefore be protected from cleanup for the duration of the recovery.
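
Conceptually, peer recovery therefore runs in two phases while a retention lock keeps the primary's history from being trimmed: first copy the segment files of a safe commit, then replay every operation above that commit's local checkpoint. The sketch below uses hypothetical helper interfaces, not the real RecoverySourceHandler API:

import java.io.Closeable;
import java.util.List;

//hypothetical interfaces standing in for the real source/target handlers
interface PrimarySource {
    Closeable acquireHistoryRetentionLock();   //keeps translog / soft-delete history from being trimmed
    List<String> safeCommitFiles();            //phase 1: segment files of a safe commit
    long safeCommitLocalCheckpoint();
    List<String> operationsSince(long seqNo);  //phase 2: operations to replay on the target
}

interface ReplicaTarget {
    void receiveFiles(List<String> files);
    void applyOperation(String op);
}

class PeerRecoverySketch {
    static void recover(PrimarySource primary, ReplicaTarget target) throws Exception {
        try (Closeable retentionLock = primary.acquireHistoryRetentionLock()) {
            //phase 1: file-based copy of a consistent point-in-time snapshot
            target.receiveFiles(primary.safeCommitFiles());
            //phase 2: replay everything indexed after that snapshot, including writes that arrived during phase 1
            long startingSeqNo = primary.safeCommitLocalCheckpoint() + 1;
            for (String op : primary.operationsSince(startingSeqNo)) {
                target.applyOperation(op);
            }
        } //only after the lock is released may the history be trimmed again
    }
}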

A replica recovers from a remote peer: the source is the primary shard and the target is the replica shard.

case PEER: //the replica recovers from the remote primary shard
    try {
        markAsRecovering("from " + recoveryState.getSourceNode(), recoveryState);
        //start recovering from the remote primary shard
        recoveryTargetService.startRecovery(this, recoveryState.getSourceNode(), recoveryListener);
    } catch (Exception e) {
        failShard("corrupted preexisting index", e);
        recoveryListener.onRecoveryFailure(recoveryState,
            new RecoveryFailedException(recoveryState, null, e), true);
    }
    break;

The replica recovery task also runs on the generic thread pool, and the initial recovery stage is again INIT

@Override
public void doRun() {
    //start the recovery
    doRecovery(recoveryId);
}

Entering the INDEX stage: the primary's Lucene data will be synchronized to the replica node

//the shard being recovered on this (target) node
final IndexShard indexShard = recoveryTarget.indexShard();
//pre-recovery callback
indexShard.preRecovery();
assert recoveryTarget.sourceNode() != null : "can not do a recovery without a source node";
logger.trace("{} preparing shard for peer recovery", recoveryTarget.shardId());
indexShard.prepareForIndexRecovery();

public void prepareForIndexRecovery() {
    if (state != IndexShardState.RECOVERING) {
        throw new IndexShardNotRecoveringException(shardId, state);
    }
    recoveryState.setStage(RecoveryState.Stage.INDEX);
    assert currentEngineReference.get() == null;
}

Building the recovery request

//build the start-recovery request
request = getStartRecoveryRequest(logger, clusterService.localNode(), recoveryTarget, startingSeqNo);
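
The startingSeqNo carried in this request is what later lets the primary choose between an operations-based and a file-based recovery: roughly, if the target still has a usable local copy it asks for the operation just above its local checkpoint, otherwise it sends UNASSIGNED_SEQ_NO. A simplified, hedged sketch of that decision (the real PeerRecoveryTargetService also tries to recover from the local translog first and inspects the safe commit):

class StartingSeqNoSketch {
    static final long UNASSIGNED_SEQ_NO = -2; //same sentinel value used by SequenceNumbers

    //hypothetical inputs: whether a safe commit exists locally and its local checkpoint
    static long startingSeqNo(boolean hasSafeCommit, long localCheckpointOfSafeCommit) {
        if (!hasSafeCommit) {
            return UNASSIGNED_SEQ_NO; //forces a file-based recovery on the source side
        }
        //everything up to the local checkpoint is already present, so ask for the next operation
        return localCheckpointOfSafeCommit + 1;
    }
}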

An RPC with action internal:index/shard/recovery/start_recovery is then sent to the node holding the primary shard, and the recovery waits for the response, which only arrives once the primary side has finished its processing

//send the start-recovery request to the primary's node
transportService.submitRequest(request.sourceNode(), PeerRecoverySourceService.Actions.START_RECOVERY, request,
    new TransportResponseHandler<RecoveryResponse>() {
        @Override
        public void handleResponse(RecoveryResponse recoveryResponse) {
            final TimeValue recoveryTime = new TimeValue(timer.time());
            // do this through ongoing recoveries to remove it from the collection
            onGoingRecoveries.markRecoveryAsDone(recoveryId);
        }

        @Override
        public void handleException(TransportException e) {
            handleException.accept(e);
        }

        @Override
        public String executor() {
            // we do some heavy work like refreshes in the response so fork off to the generic threadpool
            return ThreadPool.Names.GENERIC;
        }

        @Override
        public RecoveryResponse read(StreamInput in) throws IOException {
            return new RecoveryResponse(in);
        }
    });

The node holding the primary registers a handler for this request

transportService.registerRequestHandler(Actions.START_RECOVERY, ThreadPool.Names.GENERIC, StartRecoveryRequest::new,
    new StartRecoveryTransportRequestHandler());

class StartRecoveryTransportRequestHandler implements TransportRequestHandler<StartRecoveryRequest> {
    @Override
    public void messageReceived(final StartRecoveryRequest request, final TransportChannel channel, Task task) throws Exception {
        //handle the start-recovery request from the replica
        recover(request, new ChannelActionListener<>(channel, Actions.START_RECOVERY, request));
    }
}

Execution on the primary

private void recover(StartRecoveryRequest request, ActionListener<RecoveryResponse> listener) {
        final IndexService indexService = indicesService.indexServiceSafe(request.shardId().getIndex());
        final IndexShard shard = indexService.getShard(request.shardId().id());

        final ShardRouting routingEntry = shard.routingEntry();
        //主分片才能执行
        if (routingEntry.primary() == false || routingEntry.active() == false) {
            throw new DelayRecoveryException("source shard [" + routingEntry + "] is not an active primary");
        }

        if (request.isPrimaryRelocation() && (routingEntry.relocating() == false ||
            routingEntry.relocatingNodeId().equals(request.targetNode().getId()) == false)) {
            logger.debug("delaying recovery of {} as source shard is not marked yet as relocating to {}",
                request.shardId(), request.targetNode());
            throw new DelayRecoveryException("source shard is not marked yet as relocating to [" + request.targetNode() + "]");
        }

        RecoverySourceHandler handler = ongoingRecoveries.addNewRecovery(request, shard);
        logger.trace("[{}][{}] starting recovery to {}", request.shardId().getIndex().getName(), request.shardId().id(),
            request.targetNode());
        //处理副本分片恢复
        handler.recoverToTarget(ActionListener.runAfter(listener, () -> ongoingRecoveries.remove(shard, handler)));
    }
public void recoverToTarget(ActionListener<RecoveryResponse> listener) {
        final Closeable releaseResources = () -> IOUtils.close(resources);
        final ActionListener<RecoveryResponse> wrappedListener = ActionListener.notifyOnce(listener);
        try {
            cancellableThreads.setOnCancel((reason, beforeCancelEx) -> {
                final RuntimeException e;
                if (shard.state() == IndexShardState.CLOSED) { // check if the shard got closed on us
                    e = new IndexShardClosedException(shard.shardId(), "shard is closed and recovery was canceled reason [" + reason + "]");
                } else {
                    e = new CancellableThreads.ExecutionCancelledException("recovery was canceled reason [" + reason + "]");
                }
                if (beforeCancelEx != null) {
                    e.addSuppressed(beforeCancelEx);
                }
                IOUtils.closeWhileHandlingException(releaseResources, () -> wrappedListener.onFailure(e));
                throw e;
            });
            final Consumer<Exception> onFailure = e -> {
                assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[onFailure]");
                IOUtils.closeWhileHandlingException(releaseResources, () -> wrappedListener.onFailure(e));
            };

            final boolean softDeletesEnabled = shard.indexSettings().isSoftDeleteEnabled();
            final SetOnce<RetentionLease> retentionLeaseRef = new SetOnce<>();

            runUnderPrimaryPermit(() -> {
                final IndexShardRoutingTable routingTable = shard.getReplicationGroup().getRoutingTable();
                ShardRouting targetShardRouting = routingTable.getByAllocationId(request.targetAllocationId());
                if (targetShardRouting == null) {
                    logger.debug("delaying recovery of {} as it is not listed as assigned to target node {}", request.shardId(),
                        request.targetNode());
                    throw new DelayRecoveryException("source node does not have the shard listed in its state as allocated on the node");
                }
                assert targetShardRouting.initializing() : "expected recovery target to be initializing but was " + targetShardRouting;
                retentionLeaseRef.set(
                    shard.getRetentionLeases().get(ReplicationTracker.getPeerRecoveryRetentionLeaseId(targetShardRouting)));
            }, shardId + " validating recovery target ["+ request.targetAllocationId() + "] registered ",
                shard, cancellableThreads, logger);
            final Engine.HistorySource historySource;
            if (softDeletesEnabled && (shard.useRetentionLeasesInPeerRecovery() || retentionLeaseRef.get() != null)) {
                historySource = Engine.HistorySource.INDEX;
            } else {
                historySource = Engine.HistorySource.TRANSLOG;
            }
            //获取translog文件锁,防止flush导致文件本清空
            final Closeable retentionLock = shard.acquireHistoryRetentionLock(historySource);
            resources.add(retentionLock);
            final long startingSeqNo;
            //可以基于SequenceNumber恢复
            final boolean isSequenceNumberBasedRecovery
                = request.startingSeqNo() != SequenceNumbers.UNASSIGNED_SEQ_NO//seqNo不是未分配的序列号
                && isTargetSameHistory()//版本相同
                //给的seqNo主分片已经恢复
                && shard.hasCompleteHistoryOperations("peer-recovery", historySource, request.startingSeqNo())
                //可以从 translog 或 Lucene 索引中读取历史操作
                && (historySource == Engine.HistorySource.TRANSLOG ||
                   (retentionLeaseRef.get() != null && retentionLeaseRef.get().retainingSequenceNumber() <= request.startingSeqNo()));
            // NB check hasCompleteHistoryOperations when computing isSequenceNumberBasedRecovery, even if there is a retention lease,
            // because when doing a rolling upgrade from earlier than 7.4 we may create some leases that are initially unsatisfied. It's
            // possible there are other cases where we cannot satisfy all leases, because that's not a property we currently expect to hold.
            // Also it's pretty cheap when soft deletes are enabled, and it'd be a disaster if we tried a sequence-number-based recovery
            // without having a complete history.
            //可以基于SequenceNumber恢复,则可以将translog清空,不必持有锁了
            if (isSequenceNumberBasedRecovery && softDeletesEnabled && retentionLeaseRef.get() != null) {
                // all the history we need is retained by an existing retention lease, so we do not need a separate retention lock
                //我们需要的所有历史记录都由现有的保留租约保留,因此我们不需要单独的保留锁
                retentionLock.close();
                logger.trace("history is retained by {}", retentionLeaseRef.get());
            } else {
                // all the history we need is retained by the retention lock, obtained before calling shard.hasCompleteHistoryOperations()
                // and before acquiring the safe commit we'll be using, so we can be certain that all operations after the safe commit's
                // local checkpoint will be retained for the duration of this recovery.
                logger.trace("history is retained by retention lock");
            }

            final StepListener<SendFileResult> sendFileStep = new StepListener<>();
            final StepListener<TimeValue> prepareEngineStep = new StepListener<>();
            final StepListener<SendSnapshotResult> sendSnapshotStep = new StepListener<>();
            final StepListener<Void> finalizeStep = new StepListener<>();
            //可以基于SequenceNumber恢复
            if (isSequenceNumberBasedRecovery) {
                logger.trace("performing sequence numbers based recovery. starting at [{}]", request.startingSeqNo());
                startingSeqNo = request.startingSeqNo();
                if (retentionLeaseRef.get() == null) {
                    createRetentionLease(startingSeqNo, ActionListener.map(sendFileStep, ignored -> SendFileResult.EMPTY));
                } else {
                    sendFileStep.onResponse(SendFileResult.EMPTY);
                }
            } else {
                final Engine.IndexCommitRef safeCommitRef;
                try {
                    safeCommitRef = shard.acquireSafeIndexCommit();
                    resources.add(safeCommitRef);
                } catch (final Exception e) {
                    throw new RecoveryEngineException(shard.shardId(), 1, "snapshot failed", e);
                }

                // Try and copy enough operations to the recovering peer so that if it is promoted to primary then it has a chance of being
                // able to recover other replicas using operations-based recoveries. If we are not using retention leases then we
                // conservatively copy all available operations. If we are using retention leases then "enough operations" is just the
                // operations from the local checkpoint of the safe commit onwards, because when using soft deletes the safe commit retains
                // at least as much history as anything else. The safe commit will often contain all the history retained by the current set
                // of retention leases, but this is not guaranteed: an earlier peer recovery from a different primary might have created a
                // retention lease for some history that this primary already discarded, since we discard history when the global checkpoint
                // advances and not when creating a new safe commit. In any case this is a best-effort thing since future recoveries can
                // always fall back to file-based ones, and only really presents a problem if this primary fails before things have settled
                // down.
                // with soft deletes, replay history from the safe commit's local checkpoint + 1; otherwise replay everything from seqNo 0
                startingSeqNo = softDeletesEnabled
                    ? Long.parseLong(safeCommitRef.getIndexCommit().getUserData().get(SequenceNumbers.LOCAL_CHECKPOINT_KEY)) + 1L
                    : 0;
                logger.trace("performing file-based recovery followed by history replay starting at [{}]", startingSeqNo);

                try {
                    final int estimateNumOps = shard.estimateNumberOfHistoryOperations("peer-recovery", historySource, startingSeqNo);
                    final Releasable releaseStore = acquireStore(shard.store());
                    resources.add(releaseStore);
                    sendFileStep.whenComplete(r -> IOUtils.close(safeCommitRef, releaseStore), e -> {
                        try {
                            IOUtils.close(safeCommitRef, releaseStore);
                        } catch (final IOException ex) {
                            logger.warn("releasing snapshot caused exception", ex);
                        }
                    });

                    final StepListener<ReplicationResponse> deleteRetentionLeaseStep = new StepListener<>();
                    runUnderPrimaryPermit(() -> {
                            try {
                                // If the target previously had a copy of this shard then a file-based recovery might move its global
                                // checkpoint backwards. We must therefore remove any existing retention lease so that we can create a
                                // new one later on in the recovery.
                                shard.removePeerRecoveryRetentionLease(request.targetNode().getId(),
                                    new ThreadedActionListener<>(logger, shard.getThreadPool(), ThreadPool.Names.GENERIC,
                                        deleteRetentionLeaseStep, false));
                            } catch (RetentionLeaseNotFoundException e) {
                                logger.debug("no peer-recovery retention lease for " + request.targetAllocationId());
                                deleteRetentionLeaseStep.onResponse(null);
                            }
                        }, shardId + " removing retention lease for [" + request.targetAllocationId() + "]",
                        shard, cancellableThreads, logger);

                    deleteRetentionLeaseStep.whenComplete(ignored -> {
                        assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[phase1]");
                        // recovery phase 1: send the Lucene snapshot (segment files) to the target
                        phase1(safeCommitRef.getIndexCommit(), startingSeqNo, () -> estimateNumOps, sendFileStep);
                    }, onFailure);

                } catch (final Exception e) {
                    throw new RecoveryEngineException(shard.shardId(), 1, "sendFileStep failed", e);
                }
            }
            assert startingSeqNo >= 0 : "startingSeqNo must be non negative. got: " + startingSeqNo;
            // prepare the target engine for translog replay
            sendFileStep.whenComplete(r -> {
                assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[prepareTargetForTranslog]");
                // For a sequence based recovery, the target can keep its local translog
                prepareTargetForTranslog(
                    shard.estimateNumberOfHistoryOperations("peer-recovery", historySource, startingSeqNo), prepareEngineStep);
            }, onFailure);

            prepareEngineStep.whenComplete(prepareEngineTime -> {
                assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[phase2]");
                /*
                 * add shard to replication group (shard will receive replication requests from this point on) now that engine is open.
                 * This means that any document indexed into the primary after this will be replicated to this replica as well
                 * make sure to do this before sampling the max sequence number in the next step, to ensure that we send
                 * all documents up to maxSeqNo in phase2.
                 */
                runUnderPrimaryPermit(() -> shard.initiateTracking(request.targetAllocationId()),
                    shardId + " initiating tracking of " + request.targetAllocationId(), shard, cancellableThreads, logger);

                final long endingSeqNo = shard.seqNoStats().getMaxSeqNo();
                logger.trace("snapshot translog for recovery; current size is [{}]",
                    shard.estimateNumberOfHistoryOperations("peer-recovery", historySource, startingSeqNo));
                // create the history snapshot used in phase 2
                final Translog.Snapshot phase2Snapshot = shard.getHistoryOperations("peer-recovery", historySource, startingSeqNo);
                resources.add(phase2Snapshot);
                retentionLock.close();

                // we have to capture the max_seen_auto_id_timestamp and the max_seq_no_of_updates to make sure that these values
                // are at least as high as the corresponding values on the primary when any of these operations were executed on it.
                final long maxSeenAutoIdTimestamp = shard.getMaxSeenAutoIdTimestamp();
                final long maxSeqNoOfUpdatesOrDeletes = shard.getMaxSeqNoOfUpdatesOrDeletes();
                final RetentionLeases retentionLeases = shard.getRetentionLeases();
                final long mappingVersionOnPrimary = shard.indexSettings().getIndexMetadata().getMappingVersion();
                phase2(startingSeqNo, endingSeqNo, phase2Snapshot, maxSeenAutoIdTimestamp, maxSeqNoOfUpdatesOrDeletes,
                    retentionLeases, mappingVersionOnPrimary, sendSnapshotStep);
                sendSnapshotStep.whenComplete(
                    r -> IOUtils.close(phase2Snapshot),
                    e -> {
                        IOUtils.closeWhileHandlingException(phase2Snapshot);
                        onFailure.accept(new RecoveryEngineException(shard.shardId(), 2, "phase2 failed", e));
                    });

            }, onFailure);

            // Recovery target can trim all operations >= startingSeqNo as we have sent all these operations in the phase 2
            final long trimAboveSeqNo = startingSeqNo - 1;
            // finalize the recovery
            sendSnapshotStep.whenComplete(r -> finalizeRecovery(r.targetLocalCheckpoint, trimAboveSeqNo, finalizeStep), onFailure);

            finalizeStep.whenComplete(r -> {
                final long phase1ThrottlingWaitTime = 0L; // TODO: return the actual throttle time
                final SendSnapshotResult sendSnapshotResult = sendSnapshotStep.result();
                final SendFileResult sendFileResult = sendFileStep.result();
                final RecoveryResponse response = new RecoveryResponse(sendFileResult.phase1FileNames, sendFileResult.phase1FileSizes,
                    sendFileResult.phase1ExistingFileNames, sendFileResult.phase1ExistingFileSizes, sendFileResult.totalSize,
                    sendFileResult.existingTotalSize, sendFileResult.took.millis(), phase1ThrottlingWaitTime,
                    prepareEngineStep.result().millis(), sendSnapshotResult.totalOperations, sendSnapshotResult.tookTime.millis());
                try {
                    wrappedListener.onResponse(response);
                } finally {
                    IOUtils.close(resources);
                }
            }, onFailure);
        } catch (Exception e) {
            IOUtils.closeWhileHandlingException(releaseResources, () -> wrappedListener.onFailure(e));
        }
    }
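
The whole method is organized as a chain of asynchronous steps: sendFileStep → prepareEngineStep → sendSnapshotStep → finalizeStep, where each step only starts once the previous one has completed, and every acquired resource (retention lock, safe commit, store reference) is released at the very end or on the first failure. Below is a minimal, self-contained sketch of this chaining pattern, using CompletableFuture as a stand-in for StepListener (the stage names and messages are illustrative only, not the real API):

import java.util.concurrent.CompletableFuture;

public class RecoveryStepsSketch {
    public static void main(String[] args) {
        // each stage completes asynchronously; the next stage only runs after the previous one succeeded
        CompletableFuture<Void> sendFileStep = CompletableFuture.runAsync(
            () -> System.out.println("phase1: send segment files (or skip)"));
        CompletableFuture<Void> prepareEngineStep = sendFileStep.thenRun(
            () -> System.out.println("prepare target engine for translog replay"));
        CompletableFuture<Void> sendSnapshotStep = prepareEngineStep.thenRun(
            () -> System.out.println("phase2: replay history operations"));
        CompletableFuture<Void> finalizeStep = sendSnapshotStep.thenRun(
            () -> System.out.println("finalize recovery"));

        // a failure anywhere in the chain skips the remaining stages and surfaces here,
        // which is where RecoverySourceHandler releases the retention lock, safe commit and store reference
        finalizeStep.whenComplete((ignored, e) -> {
            if (e != null) {
                System.out.println("recovery failed: " + e);
            }
            System.out.println("release resources");
        }).join();
    }
}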

The core recovery process consists of two phases.

  • phase1
    First acquire the translog/history retention lock so that a flush cannot delete the history still needed for the recovery:
// acquire the history retention lock so that a flush cannot remove the history we still need
            final Closeable retentionLock = shard.acquireHistoryRetentionLock(historySource);
            resources.add(retentionLock);

Then determine whether an operations-based (sequence-number) recovery is possible:

final boolean isSequenceNumberBasedRecovery
                = request.startingSeqNo() != SequenceNumbers.UNASSIGNED_SEQ_NO // the target sent a valid starting seqNo
                && isTargetSameHistory() // source and target share the same history UUID
                // the primary still has a complete history of operations from the requested seqNo onwards
                && shard.hasCompleteHistoryOperations("peer-recovery", historySource, request.startingSeqNo())
                // and those operations can be read from either the translog or the Lucene index under a retention lease
                && (historySource == Engine.HistorySource.TRANSLOG ||
                   (retentionLeaseRef.get() != null && retentionLeaseRef.get().retainingSequenceNumber() <= request.startingSeqNo()));
if (isSequenceNumberBasedRecovery && softDeletesEnabled && retentionLeaseRef.get() != null) {
    // all the history we need is retained by an existing retention lease, so a separate retention lock is not required
    retentionLock.close();
    logger.trace("history is retained by {}", retentionLeaseRef.get());
} else {
    logger.trace("history is retained by retention lock");
}

Because phase1 can transfer a large amount of data, the handler first checks whether an operations-based recovery is possible; if so, phase1 is skipped entirely.

// sequence-number based recovery: skip the file copy
if (isSequenceNumberBasedRecovery) {
    logger.trace("performing sequence numbers based recovery. starting at [{}]", request.startingSeqNo());
    startingSeqNo = request.startingSeqNo();
    if (retentionLeaseRef.get() == null) {
        createRetentionLease(startingSeqNo, ActionListener.map(sendFileStep, ignored -> SendFileResult.EMPTY));
    } else {
        sendFileStep.onResponse(SendFileResult.EMPTY);
    }
}

If not, the phase1 method has to run:

void phase1(IndexCommit snapshot, long startingSeqNo, IntSupplier translogOps, ActionListener<SendFileResult> listener) {
        cancellableThreads.checkForCancel();
        final Store store = shard.store();
        try {
            StopWatch stopWatch = new StopWatch().start();
            final Store.MetadataSnapshot recoverySourceMetadata;
            try {
                recoverySourceMetadata = store.getMetadata(snapshot);
            } catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException ex) {
                shard.failShard("recovery", ex);
                throw ex;
            }
            for (String name : snapshot.getFileNames()) {
                final StoreFileMetadata md = recoverySourceMetadata.get(name);
                if (md == null) {
                    logger.info("Snapshot differs from actual index for file: {} meta: {}", name, recoverySourceMetadata.asMap());
                    throw new CorruptIndexException("Snapshot differs from actual index - maybe index was removed metadata has " +
                            recoverySourceMetadata.asMap().size() + " files", name);
                }
            }
            // check whether phase 1 can be skipped: identical sync ids (written by a synced flush after a period with no indexing) mean both copies hold the same Lucene data
            if (canSkipPhase1(recoverySourceMetadata, request.metadataSnapshot()) == false) {
                final List<String> phase1FileNames = new ArrayList<>();
                final List<Long> phase1FileSizes = new ArrayList<>();
                final List<String> phase1ExistingFileNames = new ArrayList<>();
                final List<Long> phase1ExistingFileSizes = new ArrayList<>();

                // Total size of segment files that are recovered
                long totalSizeInBytes = 0;
                // Total size of segment files that were able to be re-used
                long existingTotalSizeInBytes = 0;

                // Generate a "diff" of all the identical, different, and missing
                // segment files on the target node, using the existing files on
                // the source node
                //compute the diff between the two metadata snapshots: identical, different and missing files
                final Store.RecoveryDiff diff = recoverySourceMetadata.recoveryDiff(request.metadataSnapshot());
                //files that already exist on the target with identical content
                for (StoreFileMetadata md : diff.identical) {
                    phase1ExistingFileNames.add(md.name());
                    phase1ExistingFileSizes.add(md.length());
                    existingTotalSizeInBytes += md.length();
                    if (logger.isTraceEnabled()) {
                        logger.trace("recovery [phase1]: not recovering [{}], exist in local store and has checksum [{}]," +
                                        " size [{}]", md.name(), md.checksum(), md.length());
                    }
                    totalSizeInBytes += md.length();
                }
                //files that differ from the primary's copy or are missing on the target; these must be sent
                List<StoreFileMetadata> phase1Files = new ArrayList<>(diff.different.size() + diff.missing.size());
                phase1Files.addAll(diff.different);
                phase1Files.addAll(diff.missing);
                for (StoreFileMetadata md : phase1Files) {
                    if (request.metadataSnapshot().asMap().containsKey(md.name())) {
                        logger.trace("recovery [phase1]: recovering [{}], exists in local store, but is different: remote [{}], local [{}]",
                            md.name(), request.metadataSnapshot().asMap().get(md.name()), md);
                    } else {
                        logger.trace("recovery [phase1]: recovering [{}], does not exist in remote", md.name());
                    }
                    phase1FileNames.add(md.name());
                    phase1FileSizes.add(md.length());
                    totalSizeInBytes += md.length();
                }

                logger.trace("recovery [phase1]: recovering_files [{}] with total_size [{}], reusing_files [{}] with total_size [{}]",
                    phase1FileNames.size(), new ByteSizeValue(totalSizeInBytes),
                    phase1ExistingFileNames.size(), new ByteSizeValue(existingTotalSizeInBytes));
                final StepListener<Void> sendFileInfoStep = new StepListener<>();
                final StepListener<Void> sendFilesStep = new StepListener<>();
                final StepListener<RetentionLease> createRetentionLeaseStep = new StepListener<>();
                final StepListener<Void> cleanFilesStep = new StepListener<>();
                cancellableThreads.checkForCancel();
                //tell the target which files it is about to receive
                recoveryTarget.receiveFileInfo(phase1FileNames, phase1FileSizes, phase1ExistingFileNames,
                        phase1ExistingFileSizes, translogOps.getAsInt(), sendFileInfoStep);
                //send the segment files
                sendFileInfoStep.whenComplete(r ->
                    sendFiles(store, phase1Files.toArray(new StoreFileMetadata[0]), translogOps, sendFilesStep), listener::onFailure);
                //create the peer-recovery retention lease
                sendFilesStep.whenComplete(r -> createRetentionLease(startingSeqNo, createRetentionLeaseStep), listener::onFailure);

                createRetentionLeaseStep.whenComplete(retentionLease ->
                    {
                        final long lastKnownGlobalCheckpoint = shard.getLastKnownGlobalCheckpoint();
                        assert retentionLease == null || retentionLease.retainingSequenceNumber() - 1 <= lastKnownGlobalCheckpoint
                            : retentionLease + " vs " + lastKnownGlobalCheckpoint;
                        // Establishes new empty translog on the replica with global checkpoint set to lastKnownGlobalCheckpoint. We want
                        // the commit we just copied to be a safe commit on the replica, so why not set the global checkpoint on the replica
                        // to the max seqno of this commit? Because (in rare corner cases) this commit might not be a safe commit here on
                        // the primary, and in these cases the max seqno would be too high to be valid as a global checkpoint.
                        cleanFiles(store, recoverySourceMetadata, translogOps, lastKnownGlobalCheckpoint, cleanFilesStep);
                    },
                    listener::onFailure);

                final long totalSize = totalSizeInBytes;
                final long existingTotalSize = existingTotalSizeInBytes;
                cleanFilesStep.whenComplete(r -> {
                    final TimeValue took = stopWatch.totalTime();
                    logger.trace("recovery [phase1]: took [{}]", took);
                    listener.onResponse(new SendFileResult(phase1FileNames, phase1FileSizes, totalSize, phase1ExistingFileNames,
                        phase1ExistingFileSizes, existingTotalSize, took));
                }, listener::onFailure);
            } else {
                logger.trace("skipping [phase1] since source and target have identical sync id [{}]", recoverySourceMetadata.getSyncId());
                //phase 1 skipped thanks to the sync id
                // but we must still create a retention lease
                final StepListener<RetentionLease> createRetentionLeaseStep = new StepListener<>();
                createRetentionLease(startingSeqNo, createRetentionLeaseStep);
                createRetentionLeaseStep.whenComplete(retentionLease -> {
                    final TimeValue took = stopWatch.totalTime();
                    logger.trace("recovery [phase1]: took [{}]", took);
                    listener.onResponse(new SendFileResult(Collections.emptyList(), Collections.emptyList(), 0L, Collections.emptyList(),
                        Collections.emptyList(), 0L, took));
                }, listener::onFailure);

            }
        } catch (Exception e) {
            throw new RecoverFilesRecoveryException(request.shardId(), 0, new ByteSizeValue(0L), e);
        }
    }

Synced flush was introduced to shorten the often very long phase1 of replica recovery: by default, after 5 minutes without any indexing operations a synced flush is performed, writing a unique sync id into every copy of the shard. Copies that carry the same sync id are guaranteed to hold the same Lucene index.
The sync id therefore offers another way to skip phase1: if the primary and the replica have the same sync id and the same number of documents, phase1 can be skipped.
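
A minimal sketch of that decision (hypothetical signature; the real check is RecoverySourceHandler#canSkipPhase1 working on Store.MetadataSnapshot): phase1 can only be skipped when both copies carry the same non-null sync id, and equal sync ids with different document counts are treated as corruption.

class SyncIdCheckSketch {
    // sketch only: decide whether phase1 can be skipped based on sync ids and doc counts
    static boolean canSkipPhase1(String sourceSyncId, long sourceDocs,
                                 String targetSyncId, long targetDocs) {
        if (sourceSyncId == null || sourceSyncId.equals(targetSyncId) == false) {
            return false; // no common synced-flush point, fall back to the file diff
        }
        if (sourceDocs != targetDocs) {
            // same sync id but different doc counts should be impossible; treat it as corruption
            throw new IllegalStateException("same sync id [" + sourceSyncId + "] but number of docs differ: "
                + sourceDocs + " vs " + targetDocs);
        }
        return true;
    }
}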
If the sync id check shows that phase1 cannot be skipped:

the file metadata of the primary and the replica is compared; the resulting diff falls into three categories (a standalone sketch of this classification follows the list):

  • identical: files that are exactly the same in both snapshots
  • different: files that exist on both the source and the target but whose contents differ
  • missing: files that exist on the source but not on the target
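
A self-contained sketch of this classification, using file name, length and checksum as identity the way Store.RecoveryDiff does (simplified: the real diff also groups files per segment and handles commit files specially):

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

class FileMeta {
    final String name; final long length; final String checksum;
    FileMeta(String name, long length, String checksum) { this.name = name; this.length = length; this.checksum = checksum; }
    boolean isSame(FileMeta other) { return length == other.length && checksum.equals(other.checksum); }
}

class RecoveryDiffSketch {
    final List<FileMeta> identical = new ArrayList<>();  // reusable on the target
    final List<FileMeta> different = new ArrayList<>();  // present on both sides but with different content
    final List<FileMeta> missing = new ArrayList<>();    // present on the source only

    // classify every source file against the target's metadata, keyed by file name
    static RecoveryDiffSketch diff(Map<String, FileMeta> source, Map<String, FileMeta> target) {
        RecoveryDiffSketch d = new RecoveryDiffSketch();
        for (FileMeta src : source.values()) {
            FileMeta tgt = target.get(src.name);
            if (tgt == null) {
                d.missing.add(src);
            } else if (src.isSame(tgt)) {
                d.identical.add(src);
            } else {
                d.different.add(src);
            }
        }
        return d;
    }
}

The actual phase1 code then builds the file-name and file-size lists from exactly this kind of diff:
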
final List<String> phase1FileNames = new ArrayList<>();
                final List<Long> phase1FileSizes = new ArrayList<>();
                final List<String> phase1ExistingFileNames = new ArrayList<>();
                final List<Long> phase1ExistingFileSizes = new ArrayList<>();

                // Total size of segment files that are recovered
                long totalSizeInBytes = 0;
                // Total size of segment files that were able to be re-used
                long existingTotalSizeInBytes = 0;

                // Generate a "diff" of all the identical, different, and missing
                // segment files on the target node, using the existing files on
                // the source node
                //compute the diff between the two metadata snapshots
                final Store.RecoveryDiff diff = recoverySourceMetadata.recoveryDiff(request.metadataSnapshot());
                //files that already exist on the target with identical content
                for (StoreFileMetadata md : diff.identical) {
                    phase1ExistingFileNames.add(md.name());
                    phase1ExistingFileSizes.add(md.length());
                    existingTotalSizeInBytes += md.length();
                    if (logger.isTraceEnabled()) {
                        logger.trace("recovery [phase1]: not recovering [{}], exist in local store and has checksum [{}]," +
                                        " size [{}]", md.name(), md.checksum(), md.length());
                    }
                    totalSizeInBytes += md.length();
                }
                //files that exist on the primary but are missing or different on the replica
                List<StoreFileMetadata> phase1Files = new ArrayList<>(diff.different.size() + diff.missing.size());
                phase1Files.addAll(diff.different);
                phase1Files.addAll(diff.missing);
                for (StoreFileMetadata md : phase1Files) {
                    if (request.metadataSnapshot().asMap().containsKey(md.name())) {
                        logger.trace("recovery [phase1]: recovering [{}], exists in local store, but is different: remote [{}], local [{}]",
                            md.name(), request.metadataSnapshot().asMap().get(md.name()), md);
                    } else {
                        logger.trace("recovery [phase1]: recovering [{}], does not exist in remote", md.name());
                    }
                    phase1FileNames.add(md.name());
                    phase1FileSizes.add(md.length());
                    totalSizeInBytes += md.length();
                }

An RPC with action internal:index/shard/recovery/filesInfo is then sent to tell the target which files it is going to receive.

//tell the target which files it is about to receive
recoveryTarget.receiveFileInfo(phase1FileNames, phase1FileSizes, phase1ExistingFileNames,
                               phase1ExistingFileSizes, translogOps.getAsInt(), sendFileInfoStep);

On the target side the request is handled by FilesInfoRequestHandler:

//register the FilesInfo request handler
        transportService.registerRequestHandler(Actions.FILES_INFO, ThreadPool.Names.GENERIC, RecoveryFilesInfoRequest::new,
            new FilesInfoRequestHandler());

The file data is then sent; the transfer tasks are added to an AsyncIOProcessor and processed asynchronously:

void sendFiles(Store store, StoreFileMetadata[] files, IntSupplier translogOps, ActionListener<Void> listener) {
        ArrayUtil.timSort(files, Comparator.comparingLong(StoreFileMetadata::length)); // send smallest first
        final ThreadContext threadContext = threadPool.getThreadContext();
        final MultiFileTransfer<FileChunk> multiFileSender =
            new MultiFileTransfer<FileChunk>(logger, threadContext, listener, maxConcurrentFileChunks, Arrays.asList(files)) {

                final Deque<byte[]> buffers = new ConcurrentLinkedDeque<>();
                InputStreamIndexInput currentInput = null;
                long offset = 0;

                @Override
                protected void onNewFile(StoreFileMetadata md) throws IOException {
                    offset = 0;
                    IOUtils.close(currentInput, () -> currentInput = null);
                    final IndexInput indexInput = store.directory().openInput(md.name(), IOContext.READONCE);
                    currentInput = new InputStreamIndexInput(indexInput, md.length()) {
                        @Override
                        public void close() throws IOException {
                            IOUtils.close(indexInput, super::close); // InputStreamIndexInput's close is a noop
                        }
                    };
                }

                private byte[] acquireBuffer() {
                    final byte[] buffer = buffers.pollFirst();
                    if (buffer != null) {
                        return buffer;
                    }
                    return new byte[chunkSizeInBytes];
                }

                @Override
                protected FileChunk nextChunkRequest(StoreFileMetadata md) throws IOException {
                    assert Transports.assertNotTransportThread("read file chunk");
                    cancellableThreads.checkForCancel();
                    final byte[] buffer = acquireBuffer();
                    final int bytesRead = currentInput.read(buffer);
                    if (bytesRead == -1) {
                        throw new CorruptIndexException("file truncated; length=" + md.length() + " offset=" + offset, md.name());
                    }
                    final boolean lastChunk = offset + bytesRead == md.length();
                    final FileChunk chunk = new FileChunk(md, new BytesArray(buffer, 0, bytesRead), offset, lastChunk,
                        () -> buffers.addFirst(buffer));
                    offset += bytesRead;
                    return chunk;
                }

                @Override
                protected void executeChunkRequest(FileChunk request, ActionListener<Void> listener) {
                    cancellableThreads.checkForCancel();
                    //send one file chunk to the target
                    recoveryTarget.writeFileChunk(
                        request.md, request.position, request.content, request.lastChunk, translogOps.getAsInt(),
                        ActionListener.runBefore(listener, request::close));
                }

                @Override
                protected void handleError(StoreFileMetadata md, Exception e) throws Exception {
                    handleErrorOnSendFiles(store, e, new StoreFileMetadata[]{md});
                }

                @Override
                public void close() throws IOException {
                    IOUtils.close(currentInput, () -> currentInput = null);
                }
            };
        resources.add(multiFileSender);
        multiFileSender.start();
    }
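
nextChunkRequest above reads the current file sequentially into reusable buffers and marks the last chunk of each file. A standalone sketch of that chunking logic with plain java.io (illustrative names; the real code reuses buffers from a deque instead of copying, and streams chunks instead of collecting them):

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

class FileChunker {
    static class Chunk {
        final long offset; final byte[] data; final boolean lastChunk;
        Chunk(long offset, byte[] data, boolean lastChunk) { this.offset = offset; this.data = data; this.lastChunk = lastChunk; }
    }

    // split a file into fixed-size chunks, flagging the last one so the receiver
    // knows when it can verify the checksum and close the file
    static List<Chunk> chunk(Path file, int chunkSize) throws IOException {
        final long fileLength = Files.size(file);
        final List<Chunk> chunks = new ArrayList<>();
        long offset = 0;
        try (InputStream in = Files.newInputStream(file)) {
            final byte[] buffer = new byte[chunkSize];
            int read;
            while ((read = in.read(buffer)) != -1) {
                chunks.add(new Chunk(offset, Arrays.copyOf(buffer, read), offset + read == fileLength));
                offset += read;
            }
        }
        return chunks;
    }
}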

MultiFileTransfer iterates over the queued items and calls executeChunkRequest for each chunk:

//send one file chunk
recoveryTarget.writeFileChunk(
    request.md, request.position, request.content, request.lastChunk, translogOps.getAsInt(),
    ActionListener.runBefore(listener, request::close));

Each chunk is sent to the target via a request with action internal:index/shard/recovery/file_chunk.
On the target it is handled by FileChunkTransportRequestHandler:

//register the FileChunk request handler
        transportService.registerRequestHandler(Actions.FILE_CHUNK, ThreadPool.Names.GENERIC, RecoveryFileChunkRequest::new,
            new FileChunkTransportRequestHandler());

public void writeFileChunk(StoreFileMetadata fileMetadata, long position, BytesReference content,
                               boolean lastChunk, int totalTranslogOps, ActionListener<Void> listener) {
    try {
        state().getTranslog().totalOperations(totalTranslogOps);
        multiFileWriter.writeFileChunk(fileMetadata, position, content, lastChunk);
        listener.onResponse(null);
    } catch (Exception e) {
        listener.onFailure(e);
    }
}

After all files have been sent, an RPC with action internal:index/shard/recovery/clean_files is issued so that the target can delete any local files that do not exist on the source.

It is handled by CleanFilesRequestHandler:

transportService.registerRequestHandler(Actions.CLEAN_FILES, ThreadPool.Names.GENERIC,
            RecoveryCleanFilesRequest::new, new CleanFilesRequestHandler());
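
Conceptually the clean_files step deletes every local file that is not part of the metadata the source just sent (the real implementation goes through Store#cleanupAndVerify and also verifies checksums). A rough standalone sketch:

import java.io.IOException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Set;

class CleanFilesSketch {
    // delete every file in the shard's index directory that the source did not list
    static void cleanFiles(Path indexDir, Set<String> sourceFileNames) throws IOException {
        try (DirectoryStream<Path> files = Files.newDirectoryStream(indexDir)) {
            for (Path file : files) {
                if (sourceFileNames.contains(file.getFileName().toString()) == false) {
                    Files.delete(file);
                }
            }
        }
    }
}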

The target then enters the VERIFY_INDEX stage; if verification is enabled the index is checked just as in primary (local) recovery, after which the recovery moves on to the TRANSLOG stage.

If phase1 can be skipped based on the sync id, only the retention lease is created:

//phase 1 is skipped
                // but we must still create a retention lease
                final StepListener<RetentionLease> createRetentionLeaseStep = new StepListener<>();
                createRetentionLease(startingSeqNo, createRetentionLeaseStep);
                createRetentionLeaseStep.whenComplete(retentionLease -> {
                    final TimeValue took = stopWatch.totalTime();
                    logger.trace("recovery [phase1]: took [{}]", took);
                    listener.onResponse(new SendFileResult(Collections.emptyList(), Collections.emptyList(), 0L, Collections.emptyList(),
                        Collections.emptyList(), 0L, took));
                }, listener::onFailure);

At this point phase1 is complete.

Back in the recoverToTarget method, prepareTargetForTranslog is called, which sends a request with action internal:index/shard/recovery/prepare_translog:

//prepare the target for translog replay
            sendFileStep.whenComplete(r -> {
                assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[prepareTargetForTranslog]");
                // For a sequence based recovery, the target can keep its local translog
                prepareTargetForTranslog(
                    shard.estimateNumberOfHistoryOperations("peer-recovery", historySource, startingSeqNo), prepareEngineStep);
            }, onFailure);

On the target side it is handled by PrepareForTranslogOperationsRequestHandler:

//register the prepare-translog request handler
        transportService.registerRequestHandler(Actions.PREPARE_TRANSLOG, ThreadPool.Names.GENERIC,
                RecoveryPrepareForTranslogOperationsRequest::new, new PrepareForTranslogOperationsRequestHandler());

public void openEngineAndSkipTranslogRecovery() throws IOException {
    assert routingEntry().recoverySource().getType() == RecoverySource.Type.PEER : "not a peer recovery [" + routingEntry() + "]";
    assert recoveryState.getStage() == RecoveryState.Stage.TRANSLOG : "unexpected recovery stage [" + recoveryState.getStage() + "]";
    loadGlobalCheckpointToReplicationTracker();
    innerOpenEngineAndTranslog(replicationTracker);
    getEngine().skipTranslogRecovery();
}

As you can see, the replica's main job here is to open its engine so that it can serve write requests from this point on.
Back on the source, the translog/history snapshot is created:

//create the history snapshot used in phase 2
                final Translog.Snapshot phase2Snapshot = shard.getHistoryOperations("peer-recovery", historySource, startingSeqNo);
  • phase2 then takes a snapshot of the translog/history; it contains the operations indexed from the start of phase1 up to the moment the snapshot is created, and these operations are sent to the replica for replay
phase2(startingSeqNo, endingSeqNo, phase2Snapshot, maxSeenAutoIdTimestamp, maxSeqNoOfUpdatesOrDeletes,
                    retentionLeases, mappingVersionOnPrimary, sendSnapshotStep);

 //send the batch to the replica
sendBatch(
    readNextBatch,
    true,
    SequenceNumbers.UNASSIGNED_SEQ_NO,
    snapshot.totalOperations(),
    maxSeenAutoIdTimestamp,
    maxSeqNoOfUpdatesOrDeletes,
    retentionLeases,
    mappingVersion,
    batchedListener);

The operations are sent via requests with action internal:index/shard/recovery/translog_ops; the replica handles them in TranslogOperationsRequestHandler (a sketch of the batching loop follows the registration snippet below).

//register the translog-operations (replay) request handler
        transportService.registerRequestHandler(Actions.TRANSLOG_OPS, ThreadPool.Names.GENERIC, RecoveryTranslogOperationsRequest::new,
            new TranslogOperationsRequestHandler());
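
phase2 does not send the snapshot in a single request: operations are drained into size-bounded batches, and each batch goes out as one translog_ops request. A simplified sketch of that batching loop with hypothetical types (the real code works on Translog.Snapshot and a configurable chunk size):

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.function.Consumer;

class TranslogBatchSender {
    static class Operation {
        final long seqNo; final int sizeInBytes;
        Operation(long seqNo, int sizeInBytes) { this.seqNo = seqNo; this.sizeInBytes = sizeInBytes; }
    }

    // drain operations in [startingSeqNo, endingSeqNo] into batches of at most maxBatchSizeInBytes
    // and hand each batch to the sender (in the real code: a translog_ops request to the target)
    static void sendInBatches(Iterator<Operation> snapshot, long startingSeqNo, long endingSeqNo,
                              int maxBatchSizeInBytes, Consumer<List<Operation>> sendBatch) {
        List<Operation> batch = new ArrayList<>();
        int batchSize = 0;
        while (snapshot.hasNext()) {
            Operation op = snapshot.next();
            if (op.seqNo < startingSeqNo || op.seqNo > endingSeqNo) {
                continue; // outside the range the target needs
            }
            batch.add(op);
            batchSize += op.sizeInBytes;
            if (batchSize >= maxBatchSizeInBytes) {
                sendBatch.accept(batch);
                batch = new ArrayList<>();
                batchSize = 0;
            }
        }
        if (batch.isEmpty() == false) {
            sendBatch.accept(batch); // final, possibly smaller batch
        }
    }
}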

Once phase2 completes, the primary finalizes the recovery:

//finalize the recovery
            sendSnapshotStep.whenComplete(r -> finalizeRecovery(r.targetLocalCheckpoint, trimAboveSeqNo, finalizeStep), onFailure);

A request with action internal:index/shard/recovery/finalize is sent; the replica handles it in FinalizeRecoveryRequestHandler and enters the FINALIZE stage.

//register the finalize-recovery request handler
transportService.registerRequestHandler(Actions.FINALIZE, ThreadPool.Names.GENERIC, RecoveryFinalizeRecoveryRequest::new, new FinalizeRecoveryRequestHandler());
//enter the FINALIZE stage
indexShard.finalizeRecovery();

At this point the primary-side logic is done; the replica invokes its completion callback, marks the recovery as done, and moves into the DONE stage:

onGoingRecoveries.markRecoveryAsDone(recoveryId);
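
Throughout the recovery the target moves through the stages of RecoveryState in a fixed order (INIT → INDEX → VERIFY_INDEX → TRANSLOG → FINALIZE → DONE). A minimal sketch of such a stage machine that rejects illegal transitions (simplified compared to the real RecoveryState):

class RecoveryStageSketch {
    enum Stage { INIT, INDEX, VERIFY_INDEX, TRANSLOG, FINALIZE, DONE }

    private Stage stage = Stage.INIT;

    // only allow moving to the next stage from the expected previous one,
    // roughly mirroring RecoveryState's validate-and-set-stage behaviour
    synchronized void moveTo(Stage expectedCurrent, Stage next) {
        if (stage != expectedCurrent) {
            throw new IllegalStateException("can't move to stage [" + next + "]: current stage is ["
                + stage + "] (expected [" + expectedCurrent + "])");
        }
        stage = next;
    }
}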

To summarize, the main processing flow on the primary (source) side is:

  • acquire the history retention lock
  • decide whether an operations-based (sequence-number) recovery is possible
    • if it is, skip phase1
    • if not, run the phase1 method, which checks the sync id once more before creating and shipping a Lucene snapshot to the replica
      • if the sync ids (and doc counts) match, phase1 is still skipped
      • otherwise compare the metadata of the two copies, build the diff and send it to the replica: first the file info, then the file data, and finally tell the replica to clean up any files that do not match the source
  • tell the replica to open its engine
  • run phase2 and send the translog/history snapshot to the replica for replay
  • finalize the recovery
    Index recovery is the slowest part of cluster startup; a full cluster restart or a master failover both trigger it. As we have seen, the sequence-number and sync id mechanisms were introduced precisely to skip phase1 whenever possible and avoid shipping Lucene snapshot data.

At this point the cluster is fully started.
