一个简单的wordcount demo
public class BatchWordCount { public static void main(String[] args) throws Exception { String inputPath = "D:\\outputDir\\file.txt"; String outPath = "D:\\outputDir\\result.txt"; ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); env.setParallelism(1); DataSource<String> source = env.readTextFile(inputPath); AggregateOperator<Tuple2<String, Integer>> operator = source.flatMap(new Tokenizer()).groupBy(0).sum(1); operator.writeAsCsv(outPath).setParallelism(1); env.execute(); } public static class Tokenizer implements FlatMapFunction<String, Tuple2<String, Integer>> { @Override public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception { Arrays.asList(value.split(" ")).forEach( i -> out.collect(new Tuple2(i, 1))); } } }
从 env.execute(); 开始:
如果我们不指定名字,会使用一个默认的名称。execute 方法会触发程序执行:本地运行环境会执行程序中所有最终产生了 sink 操作的部分。
execute的关键在于创建程序的执行计划和使用执行器去执行这个任务计划
创建程序的执行计划:
创建程序的执行计划(Plan)。这个计划描述了所有的数据源、数据的 sink 操作和中间的 operations,以及它们之间是如何交互的;整个计划作为一个独立的单元,可以交给 org.apache.flink.api.common.PlanExecutor 执行。先获取计划、再通过执行器来运行它,是运行程序的另一种方式,只适用于完全由分布式操作组成的程序。
jobName – 任务名称
clearSinks – 是否开启一个新的执行阶段:为 true 时会清空已注册的 sink,并把环境标记为已执行过(wasExecuted = true)
public Plan createProgramPlan(String jobName, boolean clearSinks) { // 先判断是否有sink操作,没有的话就会抛出异常 if (this.sinks.isEmpty()) { if (wasExecuted) { throw new RuntimeException("No new data sinks have been defined since the " + "last execution. The last execution refers to the latest call to " + "'execute()', 'count()', 'collect()', or 'print()'."); } else { throw new RuntimeException("No data sinks have been created yet. " + "A program needs at least one sink that consumes data. " + "Examples are writing the data set or printing it."); } } // 如果你不指定名称,会自动生成一个名字 if (jobName == null) { jobName = getDefaultName(); } OperatorTranslation translator = new OperatorTranslation(); // 这里产生了执行计划 Plan plan = translator.translateToPlan(this.sinks, jobName); // 如果手动设置了并行度就在这里设置给执行计划 if (getParallelism() > 0) { plan.setDefaultParallelism(getParallelism()); } // 进行一些配置 plan.setExecutionConfig(getConfig()); // Check plan for GenericTypeInfo's and register the types at the serializers. if (!config.isAutoTypeRegistrationDisabled()) { plan.accept(new Visitor<org.apache.flink.api.common.operators.Operator<?>>() { private final Set<Class<?>> registeredTypes = new HashSet<>(); private final Set<org.apache.flink.api.common.operators.Operator<?>> visitedOperators = new HashSet<>(); @Override public boolean preVisit(org.apache.flink.api.common.operators.Operator<?> visitable) { if (!visitedOperators.add(visitable)) { return false; } OperatorInformation<?> opInfo = visitable.getOperatorInfo(); Serializers.recursivelyRegisterType(opInfo.getOutputType(), config, registeredTypes); return true; } @Override public void postVisit(org.apache.flink.api.common.operators.Operator<?> visitable) {} }); } try { registerCachedFilesWithPlan(plan); } catch (Exception e) { throw new RuntimeException("Error while registering cached files: " + e.getMessage(), e); } // clear all the sinks such that the next execution does not redo everything if (clearSinks) { this.sinks.clear(); wasExecuted = true; } // All types are 
registered now. Print information. int registeredTypes = config.getRegisteredKryoTypes().size() + config.getRegisteredPojoTypes().size() + config.getRegisteredTypesWithKryoSerializerClasses().size() + config.getRegisteredTypesWithKryoSerializers().size(); int defaultKryoSerializers = config.getDefaultKryoSerializers().size() + config.getDefaultKryoSerializerClasses().size(); LOG.info("The job has {} registered types and {} default Kryo serializers", registeredTypes, defaultKryoSerializers); if (config.isForceKryoEnabled() && config.isForceAvroEnabled()) { LOG.warn("In the ExecutionConfig, both Avro and Kryo are enforced. Using Kryo serializer"); } if (config.isForceKryoEnabled()) { LOG.info("Using KryoSerializer for serializing POJOs"); } if (config.isForceAvroEnabled()) { LOG.info("Using AvroSerializer for serializing POJOs"); } if (LOG.isDebugEnabled()) { LOG.debug("Registered Kryo types: {}", config.getRegisteredKryoTypes().toString()); LOG.debug("Registered Kryo with Serializers types: {}", config.getRegisteredTypesWithKryoSerializers().entrySet().toString()); LOG.debug("Registered Kryo with Serializer Classes types: {}", config.getRegisteredTypesWithKryoSerializerClasses().entrySet().toString()); LOG.debug("Registered Kryo default Serializers: {}", config.getDefaultKryoSerializers().entrySet().toString()); LOG.debug("Registered Kryo default Serializers Classes {}", config.getDefaultKryoSerializerClasses().entrySet().toString()); LOG.debug("Registered POJO types: {}", config.getRegisteredPojoTypes().toString()); // print information about static code analysis LOG.debug("Static code analysis mode: {}", config.getCodeAnalysisMode()); } return plan; }
执行任务计划
public JobExecutionResult executePlan(Plan plan) throws Exception { if (plan == null) { throw new IllegalArgumentException("The plan may not be null."); } synchronized (this.lock) { // check if we start a session dedicated for this execution final boolean shutDownAtEnd; if (jobExecutorService == null) { shutDownAtEnd = true; // configure the number of local slots equal to the parallelism of the local plan if (this.taskManagerNumSlots == DEFAULT_TASK_MANAGER_NUM_SLOTS) { int maxParallelism = plan.getMaximumParallelism(); if (maxParallelism > 0) { this.taskManagerNumSlots = maxParallelism; } } // start the cluster for us start(); } else { // we use the existing session shutDownAtEnd = false; } try { // TODO: Set job's default parallelism to max number of slots final int slotsPerTaskManager = jobExecutorServiceConfiguration.getInteger(TaskManagerOptions.NUM_TASK_SLOTS, taskManagerNumSlots); final int numTaskManagers = jobExecutorServiceConfiguration.getInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1); plan.setDefaultParallelism(slotsPerTaskManager * numTaskManagers); Optimizer pc = new Optimizer(new DataStatistics(), jobExecutorServiceConfiguration); OptimizedPlan op = pc.compile(plan); JobGraphGenerator jgg = new JobGraphGenerator(jobExecutorServiceConfiguration); JobGraph jobGraph = jgg.compileJobGraph(op, plan.getJobId()); return jobExecutorService.executeJobBlocking(jobGraph); } finally { if (shutDownAtEnd) { stop(); } } } }