Flink中JobGraph的构建

最新推荐文章于 2024-07-04 15:40:50 发布

前兄如后背

最新推荐文章于 2024-07-04 15:40:50 发布

阅读量763

点赞数 25

分类专栏： flink 文章标签： flink python 网络

本文链接：https://blog.csdn.net/m0_43437171/article/details/136100904

版权

flink 专栏收录该内容

8 篇文章 0 订阅

订阅专栏

JobGraph的构建

进入env.execute()方法

 /**
  * Builds the {@link StreamGraph} for the registered transformations, optionally names it, and
  * executes it. If the execution fails because of a {@code ClusterDatasetCorruptedException},
  * the cached transformations are invalidated and the job is rebuilt from the transformations
  * captured at the start of this call and executed once more.
  *
  * @param jobName name to assign to the job; ignored when {@code null}
  * @return the result of the job execution
  * @throws Exception any failure that is not caused by a corrupted cluster dataset, or a
  *     failure of the retry
  */
 public JobExecutionResult execute(String jobName) throws Exception {
        // Snapshot the transformations before the StreamGraph is generated so that the job can
        // be rebuilt from the same inputs if a retry becomes necessary.
        final List<Transformation<?>> transformationSnapshot = new ArrayList<>(transformations);

        // Build the StreamGraph.
        final StreamGraph initialGraph = getStreamGraph();
        if (jobName != null) {
            initialGraph.setJobName(jobName);
        }

        try {
            return execute(initialGraph);
        } catch (Throwable failure) {
            final boolean causedByCorruptedDataset =
                    ExceptionUtils.findThrowable(failure, ClusterDatasetCorruptedException.class)
                            .isPresent();
            if (causedByCorruptedDataset) {
                // Retry without cache if it is caused by corrupted cluster dataset.
                invalidateCacheTransformations(transformationSnapshot);
                // Regenerate the StreamGraph from the snapshot and execute it again.
                return execute(getStreamGraph(transformationSnapshot));
            }
            throw failure;
        }
    }

进入execute()方法

    /**
     * Submits the given {@link StreamGraph} through {@code executeAsync} and produces the job's
     * execution result, notifying every registered job listener about the outcome.
     *
     * @param streamGraph the graph to execute
     * @return the real execution result in attached mode, otherwise a detached result that only
     *     carries the job ID
     * @throws Exception the failure with any wrapping {@code ExecutionException} stripped
     */
    public JobExecutionResult execute(StreamGraph streamGraph) throws Exception {
        // Submission itself is asynchronous; see executeAsync().
        final JobClient client = executeAsync(streamGraph);

        try {
            // In attached mode wait on the result future; otherwise hand back a placeholder
            // result built from the job ID.
            final JobExecutionResult result =
                    configuration.getBoolean(DeploymentOptions.ATTACHED)
                            ? client.getJobExecutionResult().get()
                            : new DetachedJobExecutionResult(client.getJobID());

            jobListeners.forEach(listener -> listener.onJobExecuted(result, null));

            return result;
        } catch (Throwable failure) {
            // get() on the JobExecutionResult Future will throw an ExecutionException. This
            // behaviour was largely not there in Flink versions before the PipelineExecutor
            // refactoring so we should strip that exception.
            final Throwable cause = ExceptionUtils.stripExecutionException(failure);

            jobListeners.forEach(listener -> listener.onJobExecuted(null, cause));
            ExceptionUtils.rethrowException(cause);

            // never reached, only make javac happy
            return null;
        }
    }

/**
 * Hands the given {@link StreamGraph} to the pipeline executor and waits for the resulting
 * {@link JobClient}, notifying every registered job listener about the submission outcome.
 *
 * @param streamGraph the graph to submit; must not be {@code null}
 * @return the client of the submitted job
 * @throws Exception a {@code FlinkException} wrapping the stripped cause when the submission
 *     future completes exceptionally
 */
public JobClient executeAsync(StreamGraph streamGraph) throws Exception {
        checkNotNull(streamGraph, "StreamGraph cannot be null.");

        // The executor returns a future for the submission; the JobClient is obtained below.
        final CompletableFuture<JobClient> submission =
                getPipelineExecutor().execute(streamGraph, configuration, userClassloader);

        try {
            // Block until the submission has produced a JobClient.
            final JobClient client = submission.get();
            jobListeners.forEach(listener -> listener.onJobSubmitted(client, null));
            collectIterators.forEach(iterator -> iterator.setJobClient(client));
            collectIterators.clear();
            return client;
        } catch (ExecutionException submissionFailure) {
            final Throwable cause = ExceptionUtils.stripExecutionException(submissionFailure);
            jobListeners.forEach(listener -> listener.onJobSubmitted(null, cause));

            final String message =
                    String.format("Failed to execute job '%s'.", streamGraph.getJobName());
            throw new FlinkException(message, cause);
        }
    }

下面getJobGraph()方法中的pipeline参数其实就是StreamGraph

进入getJobGraph()方法查看具体构建流程

/**
 * Translates the given pipeline into a {@link JobGraph} and then applies settings taken from
 * the configuration: an optional fixed job ID, the client heartbeat timeout (only when both
 * the attached and shutdown-if-attached options are set), jars, classpaths and savepoint
 * restore settings.
 *
 * @param pipeline the pipeline to translate; must not be {@code null}
 * @param configuration the configuration to read the settings from; must not be {@code null}
 * @param userClassloader class loader passed on to the translation
 * @return the configured job graph
 * @throws MalformedURLException declared by the original signature
 */
public static JobGraph getJobGraph(
            @Nonnull final Pipeline pipeline,
            @Nonnull final Configuration configuration,
            @Nonnull ClassLoader userClassloader)
            throws MalformedURLException {
        checkNotNull(pipeline);
        checkNotNull(configuration);

        final ExecutionConfigAccessor accessor =
                ExecutionConfigAccessor.fromConfiguration(configuration);

        // Build the JobGraph; the actual translation happens in FlinkPipelineTranslationUtil.
        final int parallelism = accessor.getParallelism();
        final JobGraph translated =
                FlinkPipelineTranslationUtil.getJobGraph(
                        userClassloader, pipeline, configuration, parallelism);

        // Apply a fixed job ID only if one is configured.
        configuration
                .getOptional(PipelineOptionsInternal.PIPELINE_FIXED_JOB_ID)
                .ifPresent(hexJobId -> translated.setJobID(JobID.fromHexString(hexJobId)));

        final boolean attachedWithShutdown =
                configuration.getBoolean(DeploymentOptions.ATTACHED)
                        && configuration.getBoolean(DeploymentOptions.SHUTDOWN_IF_ATTACHED);
        if (attachedWithShutdown) {
            final long heartbeatTimeout =
                    configuration.getLong(ClientOptions.CLIENT_HEARTBEAT_TIMEOUT);
            translated.setInitialClientHeartbeatTimeout(heartbeatTimeout);
        }

        translated.addJars(accessor.getJars());
        translated.setClasspaths(accessor.getClasspaths());
        translated.setSavepointRestoreSettings(accessor.getSavepointRestoreSettings());

        return translated;
    }

进入 FlinkPipelineTranslationUtil.getJobGraph()
JobGraph翻译器
进入pipelineTranslator.translateToJobGraph()具体实现类StreamGraphTranslator

/**
 * {@link FlinkPipelineTranslator} that turns a {@link StreamGraph} into a {@link JobGraph} by
 * delegating to {@code StreamGraph#getJobGraph}.
 */
public class StreamGraphTranslator implements FlinkPipelineTranslator {

    private static final Logger LOG = LoggerFactory.getLogger(StreamGraphTranslator.class);

    /** Class loader handed to the StreamGraph when the JobGraph is generated. */
    private final ClassLoader userClassloader;

    public StreamGraphTranslator(ClassLoader userClassloader) {
        this.userClassloader = userClassloader;
    }

    /**
     * Converts the given pipeline, which must be a {@link StreamGraph}, into a {@link JobGraph}.
     * The {@code optimizerConfiguration} and {@code defaultParallelism} arguments are not used
     * by this implementation.
     */
    @Override
    public JobGraph translateToJobGraph(
            Pipeline pipeline, Configuration optimizerConfiguration, int defaultParallelism) {
        final boolean isStreamGraph = pipeline instanceof StreamGraph;
        checkArgument(isStreamGraph, "Given pipeline is not a DataStream StreamGraph.");

        // The StreamGraph produces the JobGraph itself; no explicit job ID is passed.
        return ((StreamGraph) pipeline).getJobGraph(userClassloader, null);
    }
}

进入streamGraph.getJobGraph(userClassloader, null)方法

    /**
     * Generates the {@link JobGraph} for this stream graph by delegating to
     * {@code StreamingJobGraphGenerator.createJobGraph}.
     *
     * @param userClassLoader class loader forwarded to the generator
     * @param jobID job ID forwarded to the generator; may be {@code null}
     * @return the generated job graph
     */
    public JobGraph getJobGraph(ClassLoader userClassLoader, @Nullable JobID jobID) {
        final JobGraph generated =
                StreamingJobGraphGenerator.createJobGraph(userClassLoader, this, jobID);
        return generated;
    }

/**
 * Populates and returns the {@code jobGraph} field from the {@code streamGraph} field.
 *
 * <p>The steps are order-sensitive: node hashes are generated first, operators are chained,
 * output and edge configurations are set, then slot sharing, managed memory, checkpointing,
 * user artifacts and the execution config are applied, and finally the method waits for the
 * asynchronous serialization of the vertex configs.
 *
 * @return the fully configured job graph
 * @throws IllegalConfigurationException if the execution config cannot be serialized; the
 *     underlying {@link IOException} is attached as the cause
 * @throws FlinkRuntimeException if serializing the vertex configs fails
 */
private JobGraph createJobGraph() {
        preValidate();
        jobGraph.setJobType(streamGraph.getJobType());
        jobGraph.setDynamic(streamGraph.isDynamic());

        jobGraph.enableApproximateLocalRecovery(
                streamGraph.getCheckpointConfig().isApproximateLocalRecoveryEnabled());

        // Generate deterministic hashes for the nodes in order to identify them across
        // submission iff they didn't change.
        Map<Integer, byte[]> hashes =
                defaultStreamGraphHasher.traverseStreamGraphAndGenerateHashes(streamGraph);

        // Generate legacy version hashes for backwards compatibility
        List<Map<Integer, byte[]>> legacyHashes = new ArrayList<>(legacyStreamGraphHashers.size());
        for (StreamGraphHasher hasher : legacyStreamGraphHashers) {
            legacyHashes.add(hasher.traverseStreamGraphAndGenerateHashes(streamGraph));
        }

        // Build the operator chains: StreamNodes that can be chained are merged into one
        // chain. NOTE(review): the JobVertex / JobEdge / IntermediateDataSet objects are
        // presumably created as part of this step - confirm in setChaining().
        setChaining(hashes, legacyHashes);

        if (jobGraph.isDynamic()) {
            setVertexParallelismsForDynamicGraphIfNecessary();
        }

        // Note that we set all the non-chainable outputs configuration here because the
        // "setVertexParallelismsForDynamicGraphIfNecessary" may affect the parallelism of job
        // vertices and partition-reuse
        final Map<Integer, Map<StreamEdge, NonChainedOutput>> opIntermediateOutputs =
                new HashMap<>();
        setAllOperatorNonChainedOutputsConfigs(opIntermediateOutputs);
        setAllVertexNonChainedOutputsConfigs(opIntermediateOutputs);

        // Set the physical edges. NOTE(review): presumably this records each JobVertex's
        // incoming edges in that vertex's StreamConfig - confirm in setPhysicalEdges().
        setPhysicalEdges();

        markSupportingConcurrentExecutionAttempts();

        validateHybridShuffleExecuteInBatchMode();

        setSlotSharingAndCoLocation();

        setManagedMemoryFraction(
                Collections.unmodifiableMap(jobVertices),
                Collections.unmodifiableMap(vertexConfigs),
                Collections.unmodifiableMap(chainedConfigs),
                id -> streamGraph.getStreamNode(id).getManagedMemoryOperatorScopeUseCaseWeights(),
                id -> streamGraph.getStreamNode(id).getManagedMemorySlotScopeUseCases());

        configureCheckpointing();

        jobGraph.setSavepointRestoreSettings(streamGraph.getSavepointRestoreSettings());

        final Map<String, DistributedCache.DistributedCacheEntry> distributedCacheEntries =
                JobGraphUtils.prepareUserArtifactEntries(
                        streamGraph.getUserArtifacts().stream()
                                .collect(Collectors.toMap(e -> e.f0, e -> e.f1)),
                        jobGraph.getJobID());

        for (Map.Entry<String, DistributedCache.DistributedCacheEntry> entry :
                distributedCacheEntries.entrySet()) {
            jobGraph.addUserArtifact(entry.getKey(), entry.getValue());
        }

        // set the ExecutionConfig last when it has been finalized
        try {
            jobGraph.setExecutionConfig(streamGraph.getExecutionConfig());
        } catch (IOException e) {
            // Keep the IOException as the cause so the offending type can be diagnosed.
            throw new IllegalConfigurationException(
                    "Could not serialize the ExecutionConfig. "
                            + "This indicates that non-serializable types (like custom serializers) were registered",
                    e);
        }

        jobGraph.setChangelogStateBackendEnabled(streamGraph.isChangelogStateBackendEnabled());

        addVertexIndexPrefixInVertexName();

        setVertexDescription();

        // Wait for the serialization of operator coordinators and stream config.
        try {
            FutureUtils.combineAll(
                            vertexConfigs.values().stream()
                                    .map(
                                            config ->
                                                    config.triggerSerializationAndReturnFuture(
                                                            serializationExecutor))
                                    .collect(Collectors.toList()))
                    .get();

            waitForSerializationFuturesAndUpdateJobVertices();
        } catch (Exception e) {
            throw new FlinkRuntimeException("Error in serialization.", e);
        }

        if (!streamGraph.getJobStatusHooks().isEmpty()) {
            jobGraph.setJobStatusHooks(streamGraph.getJobStatusHooks());
        }

        return jobGraph;
    }

进行StreamNode的合并：如果判断相邻的两个StreamNode可以合并，则会合并为一个OperatorChain。
1、如果该节点是一个chain的头节点，就会生成一个JobVertex；
2、如果不是头节点，就要把自身配置并入头节点，然后把头节点和自己的输出边相连；对于不能chain的节点，当做只有头节点处理即可。
作用：减少线程之间切换的性能消耗，减少数据缓冲区的交换，降低序列化反序列化的压力，同时也能减少延迟、提升吞吐量。

直接进入setChaining()
在这里插入图片描述
进入createChain()

/**
 * Recursively builds the operator chain that {@code currentNodeId} belongs to, as well as the
 * chains that start behind its non-chainable out-edges.
 *
 * <p>Nothing is done (and an empty list is returned) if the start node of {@code chainInfo}
 * has already been built.
 *
 * @param currentNodeId id of the StreamNode to process
 * @param chainIndex position of the current node within its chain
 * @param chainInfo bookkeeping for the chain the current node belongs to
 * @param chainEntryPoints chain infos keyed by the node id at which a chain starts
 * @return the out-edges that leave the chain, collected from the current node and from every
 *     node chained behind it
 */
private List<StreamEdge> createChain(
            final Integer currentNodeId,
            final int chainIndex,
            final OperatorChainInfo chainInfo,
            final Map<Integer, OperatorChainInfo> chainEntryPoints) {

        Integer startNodeId = chainInfo.getStartNodeId();
        if (!builtVertices.contains(startNodeId)) {

            List<StreamEdge> transitiveOutEdges = new ArrayList<StreamEdge>();
            // Out-edges whose target can be chained to the current node.
            List<StreamEdge> chainableOutputs = new ArrayList<StreamEdge>();
            // Out-edges whose target cannot be chained to the current node.
            List<StreamEdge> nonChainableOutputs = new ArrayList<StreamEdge>();
            // The StreamNode currently being processed.
            StreamNode currentNode = streamGraph.getStreamNode(currentNodeId);

            // Walk over the out-edges of the current node and decide, per edge, whether the
            // operators on both ends of the edge can be chained together.
            for (StreamEdge outEdge : currentNode.getOutEdges()) {
                if (isChainable(outEdge, streamGraph)) {
                    // Remember the edge as chainable.
                    chainableOutputs.add(outEdge);
                } else {
                    // Remember the edge as non-chainable.
                    nonChainableOutputs.add(outEdge);
                }
            }
            // Extend the current chain along every chainable edge.
            for (StreamEdge chainable : chainableOutputs) {
                // Recurse into the edge's target with the same chainInfo and chainIndex + 1.
                // E.g. once A and B are chained, the recursive call for B checks in turn
                // whether B's own targets can join the chain as well.
                transitiveOutEdges.addAll(
                        createChain(
                                chainable.getTargetId(),
                                chainIndex + 1,
                                chainInfo,
                                chainEntryPoints));
            }

            // Every non-chainable edge leaves this chain; its target becomes the start of a
            // chain of its own, whose chain info is created on first use.
            for (StreamEdge nonChainable : nonChainableOutputs) {
                transitiveOutEdges.add(nonChainable);
                createChain(
                        nonChainable.getTargetId(),
                        1, // operators start at position 1 because 0 is for chained source inputs
                        chainEntryPoints.computeIfAbsent(
                                nonChainable.getTargetId(),
                                (k) -> chainInfo.newChain(nonChainable.getTargetId())),
                        chainEntryPoints);
            }

            // Record the chained name and resources for the current node, derived from the
            // node itself and its chainable outputs.
            chainedNames.put(
                    currentNodeId,
                    createChainedName(
                            currentNodeId,
                            chainableOutputs,
                            Optional.ofNullable(chainEntryPoints.get(currentNodeId))));
            chainedMinResources.put(
                    currentNodeId, createChainedMinResources(currentNodeId, chainableOutputs));
            chainedPreferredResources.put(
                    currentNodeId,
                    createChainedPreferredResources(currentNodeId, chainableOutputs));

            OperatorID currentOperatorId =
                    chainInfo.addNodeToChain(
                            currentNodeId,
                            streamGraph.getStreamNode(currentNodeId).getOperatorName());

            // Input/output formats are registered on the container of the chain's start node.
            if (currentNode.getInputFormat() != null) {
                getOrCreateFormatContainer(startNodeId)
                        .addInputFormat(currentOperatorId, currentNode.getInputFormat());
            }

            if (currentNode.getOutputFormat() != null) {
                getOrCreateFormatContainer(startNodeId)
                        .addOutputFormat(currentOperatorId, currentNode.getOutputFormat());
            }
            // If the current node is the start node of the chain, create the JobVertex and use
            // the StreamConfig it returns; otherwise use a fresh, empty StreamConfig.
            StreamConfig config =
                    currentNodeId.equals(startNodeId)
                            ? createJobVertex(startNodeId, chainInfo)
                            : new StreamConfig(new Configuration());

            tryConvertPartitionerForDynamicGraph(chainableOutputs, nonChainableOutputs);

            setOperatorConfig(currentNodeId, config, chainInfo.getChainedSources());

            setOperatorChainedOutputsConfig(config, chainableOutputs);

            // we cache the non-chainable outputs here, and set the non-chained config later
            opNonChainableOutputsCache.put(currentNodeId, nonChainableOutputs);

            if (currentNodeId.equals(startNodeId)) {
                // Start node: record the edges leaving the chain and attach the configs of
                // the operators chained behind it.
                chainInfo.setTransitiveOutEdges(transitiveOutEdges);
                chainInfos.put(startNodeId, chainInfo);

                config.setChainStart();
                config.setChainIndex(chainIndex);
                config.setOperatorName(streamGraph.getStreamNode(currentNodeId).getOperatorName());
                config.setTransitiveChainedTaskConfigs(chainedConfigs.get(startNodeId));

            } else {
                // Chained (non-start) node: store its config under the chain's start node.
                chainedConfigs.computeIfAbsent(
                        startNodeId, k -> new HashMap<Integer, StreamConfig>());

                config.setChainIndex(chainIndex);
                StreamNode node = streamGraph.getStreamNode(currentNodeId);
                config.setOperatorName(node.getOperatorName());
                chainedConfigs.get(startNodeId).put(currentNodeId, config);
            }

            config.setOperatorID(currentOperatorId);

            // A node without chainable outputs is the last operator of its chain.
            if (chainableOutputs.isEmpty()) {
                config.setChainEnd();
            }
            return transitiveOutEdges;

        } else {
            return new ArrayList<>();
        }
    }

1、首先初始化了两个集合，来存储可chain和不可chain的StreamEdge，

2、然后获取到当前要处理的StreamNode

3、遍历当前StreamNode的边，来判断边两边上下游的StreamNode能否chain在一起，

4、将可以chain和不能chain的StreamEdge分别放入各自的集合

5、然后将可以chain的StreamNode，chain在一起形成一个OperatorChain，然后继续递归调用，判断chain完成后再下游的StreamNode能否继续chain在一起

6、将不能chain在一起的StreamNode取出，同样向下递归调用，判断下游的StreamNode能否和再下游的StreamNode合并。

7、在递归完成后判断当前节点是否是chain中的第一个StreamNode，如果是则开始构建JobVertex

8、同样判断当前节点是否是chain中的第一个StreamNode，如果是则开始构建JobEdge和IntermediateDataSet

进入isChainable()查看合并条件（下面贴出的代码是isChainableInput()方法）

/**
 * Decides whether the operators on both ends of the given edge may be chained together.
 *
 * <p>All of the following must hold, checked in this order: both nodes are in the same slot
 * sharing group, {@code areOperatorsChainable} accepts the pair, the edge uses a
 * {@code ForwardPartitioner}, the edge's shuffle mode is not {@code BATCH}, both nodes have
 * the same parallelism, and chaining is enabled on the graph. In addition, no other in-edge of
 * the downstream node may share this edge's type number.
 *
 * @param edge the edge connecting the two candidate nodes
 * @param streamGraph the graph the edge belongs to
 * @return {@code true} if the two nodes can be chained
 */
private static boolean isChainableInput(StreamEdge edge, StreamGraph streamGraph) {
        // Resolve the nodes on both ends of the edge.
        final StreamNode producer = streamGraph.getSourceVertex(edge);
        final StreamNode consumer = streamGraph.getTargetVertex(edge);

        // Both operators must live in the same slot sharing group.
        if (!producer.isSameSlotSharingGroup(consumer)) {
            return false;
        }
        // The operators' chaining strategies must permit the chain.
        if (!areOperatorsChainable(producer, consumer, streamGraph)) {
            return false;
        }
        // The edge must use forward partitioning.
        if (!(edge.getPartitioner() instanceof ForwardPartitioner)) {
            return false;
        }
        // The shuffle mode of the edge must not be BATCH.
        if (edge.getShuffleMode() == ShuffleMode.BATCH) {
            return false;
        }
        // Both operators must run with the same parallelism.
        if (producer.getParallelism() != consumer.getParallelism()) {
            return false;
        }
        // Chaining must be enabled for the whole graph.
        if (!streamGraph.isChainingEnabled()) {
            return false;
        }

        // check that we do not have a union operation, because unions currently only work
        // through the network/byte-channel stack.
        // we check that by testing that each "type" (which means input position) is used only once
        final int typeNumber = edge.getTypeNumber();
        for (StreamEdge sibling : consumer.getInEdges()) {
            final boolean isOtherEdge = sibling != edge;
            if (isOtherEdge && sibling.getTypeNumber() == typeNumber) {
                return false;
            }
        }
        return true;
    }

进入areOperatorsChainable()

/**
 * Decides, based on the operator factories and their chaining strategies, whether the
 * downstream operator may be chained to the upstream operator.
 *
 * <p>The pair is chainable only if both factories are present, the downstream operator is not
 * a yielding operator behind a legacy source, the upstream strategy is anything but
 * {@code NEVER}, and the downstream strategy is {@code ALWAYS}, or {@code HEAD_WITH_SOURCES}
 * with an upstream {@code SourceOperatorFactory}.
 *
 * @param upStreamVertex the upstream node
 * @param downStreamVertex the downstream node
 * @param streamGraph the graph both nodes belong to
 * @return {@code true} if the two operators can be chained
 * @throws RuntimeException if either operator reports an unknown chaining strategy
 */
@VisibleForTesting
    static boolean areOperatorsChainable(
            StreamNode upStreamVertex, StreamNode downStreamVertex, StreamGraph streamGraph) {
        // Neither node may be missing its operator factory.
        StreamOperatorFactory<?> upStreamOperator = upStreamVertex.getOperatorFactory();
        StreamOperatorFactory<?> downStreamOperator = downStreamVertex.getOperatorFactory();
        if (downStreamOperator == null || upStreamOperator == null) {
            return false;
        }

        // yielding operators cannot be chained to legacy sources
        // unfortunately the information that vertices have been chained is not preserved at this
        // point
        if (downStreamOperator instanceof YieldingOperatorFactory
                && getHeadOperator(upStreamVertex, streamGraph).isLegacySource()) {
            return false;
        }

        // we use switch/case here to make sure this is exhaustive if ever values are added to the
        // ChainingStrategy enum
        boolean isChainable;

        // Upstream side: every strategy except NEVER allows a downstream operator to be
        // chained to it.
        switch (upStreamOperator.getChainingStrategy()) {
            case NEVER:
                // Never chained to the operator after it.
                isChainable = false;
                break;
            case ALWAYS:
            case HEAD:
            case HEAD_WITH_SOURCES:
                // A head operator does not chain to its upstream, but downstream operators
                // can still be chained to it.
                isChainable = true;
                break;
            default:
                throw new RuntimeException(
                        "Unknown chaining strategy: " + upStreamOperator.getChainingStrategy());
        }

        // Downstream side: only ALWAYS (or HEAD_WITH_SOURCES behind a source) may be chained
        // to the operator before it.
        switch (downStreamOperator.getChainingStrategy()) {
            case NEVER:
            case HEAD:
                isChainable = false;
                break;
            case ALWAYS:
                // keep the value from upstream
                break;
            case HEAD_WITH_SOURCES:
                // only if upstream is a source
                isChainable &= (upStreamOperator instanceof SourceOperatorFactory);
                break;
            default:
                // Report the downstream strategy: it is the one being switched on here.
                throw new RuntimeException(
                        "Unknown chaining strategy: " + downStreamOperator.getChainingStrategy());
        }

        return isChainable;
    }

总结：