https://www.cnblogs.com/bethunebtj/p/9168274.html
写在最前:因为这篇博客太长,所以我把它转成了带书签的pdf格式,看起来更方便一点。想要的童鞋可以到我的公众号“老白讲互联网”后台留言flink即可获取。
追源索骥:透过源码看懂Flink核心框架的执行流程
flink
前言
Flink是大数据处理领域最近很火的一个开源的分布式、高性能的流式处理框架,其对数据的处理可以达到毫秒级别。本文以一个来自官网的WordCount例子为引,全面阐述Flink的核心架构及执行流程,希望读者可以借此更加深入地理解Flink逻辑。
本文跳过了一些基本概念,如果对相关概念感到迷惑,请参考官网文档。另外在本文写作过程中,Flink正式发布了其1.5 RELEASE版本,在其发布之后完成的内容将按照1.5的实现来组织。
1.从 Hello,World WordCount开始
首先,我们把WordCount的例子再放一遍:
public class SocketTextStreamWordCount {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">public</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">static</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">void</span> main(String[] args) <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">throws</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> Exception {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (args.length != 2<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">){
System.err.println(</span>"USAGE:\nSocketTextStreamWordCount <hostname> <port>"<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">);
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">return</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">;
}
String hostName </span>= args[0<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">];
Integer port </span>= Integer.parseInt(args[1<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">]);
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> set up the execution environment</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span> StreamExecutionEnvironment env =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> StreamExecutionEnvironment
.getExecutionEnvironment();
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> get input data</span>
DataStream<String> text =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> env.socketTextStream(hostName, port);
text.flatMap(</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> LineSplitter()).setParallelism(1<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">)
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> group by the tuple field "0" and sum up tuple field "1"</span>
.keyBy(0<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">)
.sum(</span>1).setParallelism(1<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">)
.print();
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> execute program</span>
env.execute("Java WordCount from SocketTextStream Example"<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">);
}
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">/**</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">
* Implements the string tokenizer that splits sentences into words as a user-defined
* FlatMapFunction. The function takes a line (String) and splits it into
* multiple pairs in the form of "(word,1)" (Tuple2&lt;String, Integer&gt;).
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">*/</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">public</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">static</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">class</span> LineSplitter <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">implements</span> FlatMapFunction<String, Tuple2<String, Integer>><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
@Override
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">public</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">void</span> flatMap(String value, Collector<Tuple2<String, Integer>><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> out) {
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> normalize and split the line</span>
String[] tokens = value.toLowerCase().split("\\W+"<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">);
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> emit the pairs</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">for</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> (String token : tokens) {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (token.length() > 0<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
out.collect(</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> Tuple2<String, Integer>(token, 1<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">));
}
}
}
}
}</span></pre><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div></div></div><p style="margin:10px auto;">首先从命令行中获取socket对端的ip和端口,然后启动一个执行环境,从socket中读取数据,split成单个单词的流,并按单词进行总和的计数,最后打印出来。这个例子相信接触过大数据计算或者函数式编程的人都能看懂,就不过多解释了。</p><div class="md-section-divider" style="margin:0px;padding:0px;"> </div><h3 style="margin-top:10px;margin-bottom:10px;padding:0px;font-size:16px;font-family:inherit;line-height:1.5;color:inherit;"><a name="t3"></a>1.1 flink执行环境</h3><p style="margin:10px auto;">程序的启动,从这句开始:<code style="margin:0px;padding:2px 4px;font-family:Monaco, Menlo, Consolas, 'Courier New', monospace;font-size:14px;color:rgb(44,62,80);white-space:nowrap;background-color:rgb(214,219,223);border:0px;">final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment()</code>。 <br style="margin:0px;padding:0px;">这行代码会返回一个可用的执行环境。执行环境是整个flink程序执行的上下文,记录了相关配置(如并行度等),并提供了一系列方法,如读取输入流的方法,以及真正开始运行整个代码的execute方法等。对于分布式流处理程序来说,我们在代码中定义的flatMap,keyBy等等操作,事实上可以理解为一种声明,告诉整个程序我们采用了什么样的算子,而真正开启计算的代码不在此处。由于我们是在本地运行flink程序,因此这行代码会返回一个LocalStreamEnvironment,最后我们要调用它的execute方法来开启真正的任务。我们先接着往下看。</p><div class="md-section-divider" style="margin:0px;padding:0px;"> </div><h3 style="margin-top:10px;margin-bottom:10px;padding:0px;font-size:16px;font-family:inherit;line-height:1.5;color:inherit;"><a name="t4"></a>1.2 算子(Operator)的注册(声明)</h3><p style="margin:10px auto;">我们以flatMap为例,<code style="margin:0px;padding:2px 4px;font-family:Monaco, Menlo, Consolas, 'Courier New', 
monospace;font-size:14px;color:rgb(44,62,80);white-space:nowrap;background-color:rgb(214,219,223);border:0px;">text.flatMap(new LineSplitter())</code>这一句话跟踪进去是这样的:</p><div class="cnblogs_code" style="margin:5px 0px;padding:5px;background-color:rgb(245,245,245);border:1px solid rgb(204,204,204);font-family:'Courier New';font-size:12px;"><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div><pre style="margin-bottom:10px;padding:10px 15px;white-space:pre-wrap;font-size:14px;color:rgb(51,51,51);background:rgba(102,128,153,.05) none repeat scroll 0px 0px;border:0px solid rgba(0,0,0,.15);font-family:'Courier New';"><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">public</span> <R> SingleOutputStreamOperator<R> flatMap(FlatMapFunction<T, R><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> flatMapper) {
TypeInformation</span><R> outType =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> TypeExtractor.getFlatMapReturnTypes(clean(flatMapper),
getType(), Utils.getCallLocationName(), </span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">true</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">);
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">return</span> transform("Flat Map", outType, <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> StreamFlatMap<><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">(clean(flatMapper)));
}</span></pre><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div></div><p style="margin:10px auto;"> </p><p style="margin:10px auto;">里面完成了两件事,一是用反射拿到了flatMap算子的输出类型,二是生成了一个Operator。flink流式计算的核心概念,就是将数据从输入流一个个传递给Operator进行链式处理,最后交给输出流的过程。对数据的每一次处理在逻辑上成为一个operator,并且为了本地化处理的效率起见,operator之间也可以串成一个chain一起处理(可以参考责任链模式帮助理解)。下面这张图表明了flink是如何看待用户的处理流程的:抽象化为一系列operator,以source开始,以sink结尾,中间的operator做的操作叫做transform,并且可以把几个操作串在一起执行。 <br style="margin:0px;padding:0px;"><img title="" src="http://static.zybuluo.com/bethunebtj/jal2x1y6zqs4jug4ryqnvu3l/image_1cae39t06eoo3ml1be8o0412c69.png" alt="image_1cae39t06eoo3ml1be8o0412c69.png-43.5kB" style="padding:0px;border:0px;height:auto;max-width:900px;vertical-align:middle;"> <br style="margin:0px;padding:0px;">我们也可以更改flink的设置,要求它不要对某个操作进行chain处理,或者从某个操作开启一个新chain等。 <br style="margin:0px;padding:0px;">上面代码中的最后一行transform方法的作用是返回一个SingleOutputStreamOperator,它继承了Datastream类并且定义了一些辅助方法,方便对流的操作。在返回之前,transform方法还把它注册到了执行环境中(后面生成执行图的时候还会用到它)。其他的操作,包括keyBy,sum和print,都只是不同的算子,在这里出现都是一样的效果,即生成一个operator并注册给执行环境用于生成DAG。</p><div class="md-section-divider" style="margin:0px;padding:0px;"> </div><h3 style="margin-top:10px;margin-bottom:10px;padding:0px;font-size:16px;font-family:inherit;line-height:1.5;color:inherit;"><a name="t5"></a>1.3 程序的执行</h3><p style="margin:10px auto;">程序执行即<code style="margin:0px;padding:2px 4px;font-family:Monaco, Menlo, Consolas, 'Courier New', monospace;font-size:14px;color:rgb(44,62,80);white-space:nowrap;background-color:rgb(214,219,223);border:0px;">env.execute("Java WordCount from SocketTextStream Example")</code>这行代码。</p><div 
class="md-section-divider" style="margin:0px;padding:0px;"> </div><h4 style="margin-top:10px;margin-bottom:10px;padding:0px;font-size:14px;font-family:inherit;line-height:20px;color:rgb(51,51,51);">1.3.1 本地模式下的execute方法</h4><p style="margin:10px auto;">这行代码主要做了以下事情:</p><ul style="margin-bottom:10px;margin-left:30px;"><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">生成StreamGraph。代表程序的拓扑结构,是从用户代码直接生成的图。</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">生成JobGraph。这个图是要交给flink去生成task的图。</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">生成一系列配置</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">将JobGraph和配置交给flink集群去运行。如果不是本地运行的话,还会把jar文件通过网络发给其他节点。</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">以本地模式运行的话,可以看到启动过程,如启动性能度量、web模块、JobManager、ResourceManager、taskManager等等</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">启动任务。值得一提的是在启动任务之前,先启动了一个用户类加载器,这个类加载器可以用来做一些在运行时动态加载类的工作。</li></ul><div class="md-section-divider" style="margin:0px;padding:0px;"> </div><h4 style="margin-top:10px;margin-bottom:10px;padding:0px;font-size:14px;font-family:inherit;line-height:20px;color:rgb(51,51,51);">1.3.2 远程模式(RemoteEnvironment)的execute方法</h4><p style="margin:10px auto;">远程模式的程序执行更加有趣一点。第一步仍然是获取StreamGraph,然后调用executeRemotely方法进行远程执行。 <br style="margin:0px;padding:0px;">该方法首先创建一个用户代码加载器</p><div class="cnblogs_code" style="margin:5px 0px;padding:5px;background-color:rgb(245,245,245);border:1px solid rgb(204,204,204);font-family:'Courier New';font-size:12px;"><pre style="margin-bottom:10px;padding:10px 15px;white-space:pre-wrap;font-size:14px;color:rgb(51,51,51);background:rgba(102,128,153,.05) none repeat scroll 0px 0px;border:0px solid rgba(0,0,0,.15);font-family:'Courier New';">ClassLoader usercodeClassLoader = 
JobWithJars.buildUserCodeClassLoader(jarFiles, globalClasspaths, getClass().getClassLoader());</pre></div><p style="margin:10px auto;"> </p><p style="margin:10px auto;">然后创建一系列配置,交给Client对象。Client这个词有意思,看见它就知道这里绝对是跟远程集群打交道的客户端。</p><div class="md-section-divider" style="margin:0px;padding:0px;"><div class="cnblogs_code" style="margin:5px 0px;padding:5px;background-color:rgb(245,245,245);border:1px solid rgb(204,204,204);font-family:'Courier New';font-size:12px;"><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div><pre style="margin-bottom:10px;padding:10px 15px;white-space:pre-wrap;font-size:14px;color:rgb(51,51,51);background:rgba(102,128,153,.05) none repeat scroll 0px 0px;border:0px solid rgba(0,0,0,.15);font-family:'Courier New';"><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> ClusterClient client;
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">try</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
client </span>= <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> StandaloneClusterClient(configuration);
client.setPrintStatusDuringExecution(getConfig().isSysoutLoggingEnabled());
}
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">try</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">return</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> client.run(streamGraph, jarFiles, globalClasspaths, usercodeClassLoader).getJobExecutionResult();
}</span></pre><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div></div><p style="margin:10px auto;"> </p></div><p style="margin:10px auto;">client的run方法首先生成一个JobGraph,然后将其传递给JobClient。关于Client、JobClient、JobManager到底谁管谁,可以看这张图: <br style="margin:0px;padding:0px;"><img title="" src="http://static.zybuluo.com/bethunebtj/6hhl3e1fumlr0aq78d2m35nt/image_1cae7g15p6k94no1ves121c5pd9.png" alt="image_1cae7g15p6k94no1ves121c5pd9.png-19.7kB" style="padding:0px;border:0px;height:auto;max-width:900px;vertical-align:middle;"><br style="margin:0px;padding:0px;">确切的说,JobClient负责以异步的方式和JobManager通信(Actor是scala的异步模块),具体的通信任务由JobClientActor完成。相对应的,JobManager的通信任务也由一个Actor完成。</p><div class="md-section-divider" style="margin:0px;padding:0px;"><div class="cnblogs_code" style="margin:5px 0px;padding:5px;background-color:rgb(245,245,245);border:1px solid rgb(204,204,204);font-family:'Courier New';font-size:12px;"><pre style="margin-bottom:10px;padding:10px 15px;white-space:pre-wrap;font-size:14px;color:rgb(51,51,51);background:rgba(102,128,153,.05) none repeat scroll 0px 0px;border:0px solid rgba(0,0,0,.15);font-family:'Courier New';"> JobListeningContext jobListeningContext =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> submitJob(
actorSystem,config,highAvailabilityServices,jobGraph,timeout,sysoutLogUpdates, classLoader);
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">return</span> awaitJobResult(jobListeningContext);</pre></div><p style="margin:10px auto;"> </p></div><p style="margin:10px auto;">可以看到,该方法先通过submitJob得到一个JobListeningContext,然后阻塞在awaitJobResult方法上,并最终返回作业的执行结果;透过这个Context可以得到程序运行的状态和结果。</p><div class="md-section-divider" style="margin:0px;padding:0px;"> </div><h4 style="margin-top:10px;margin-bottom:10px;padding:0px;font-size:14px;font-family:inherit;line-height:20px;color:rgb(51,51,51);">1.3.3 程序启动过程</h4><p style="margin:10px auto;">上面提到,整个程序真正意义上开始执行,是这里:</p><div class="md-section-divider" style="margin:0px;padding:0px;"> </div><ol class="linenums" style="margin-bottom:0px;margin-left:35px;padding-left:40px;color:rgba(102,128,153,.4);"><li class="L0" style="margin:0px 0px 1em;padding:0px 0px 0px 15px;line-height:20px;"><code style="margin:0px;padding:2px 4px;font-family:Monaco, Menlo, Consolas, 'Courier New', monospace;font-size:14px;color:rgb(44,62,80);white-space:nowrap;background-color:rgb(214,219,223);border:0px;"><span class="pln" style="margin:0px;padding:0px;color:rgb(0,0,0);">env</span><span class="pun" style="margin:0px;padding:0px;color:rgb(102,102,0);">.</span><span class="pln" style="margin:0px;padding:0px;color:rgb(0,0,0);">execute</span><span class="pun" style="margin:0px;padding:0px;color:rgb(102,102,0);">(</span><span class="str" style="margin:0px;padding:0px;color:rgb(0,136,0);">"Java WordCount from SocketTextStream Example"</span><span class="pun" style="margin:0px;padding:0px;color:rgb(102,102,0);">);</span></code></li></ol><p style="margin:10px auto;">远程模式和本地模式有一点不同,我们先按本地模式来调试。 <br style="margin:0px;padding:0px;">我们跟进源码,(在本地调试模式下)会启动一个miniCluster,然后开始执行代码:</p><div class="md-section-divider" style="margin:0px;padding:0px;"><div class="cnblogs_code" style="margin:5px 0px;padding:5px;background-color:rgb(245,245,245);border:1px solid rgb(204,204,204);font-family:'Courier New';font-size:12px;"><div 
class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div><pre style="margin-bottom:10px;padding:10px 15px;white-space:pre-wrap;font-size:14px;color:rgb(51,51,51);background:rgba(102,128,153,.05) none repeat scroll 0px 0px;border:0px solid rgba(0,0,0,.15);font-family:'Courier New';"><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> LocalStreamEnvironment.java</span>
@Override
public JobExecutionResult execute(String jobName) throws Exception {
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">生成各种图结构</span>
…
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">try</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">启动集群,包括启动JobMaster,进行leader选举等等</span>
miniCluster.start();
configuration.setInteger(RestOptions.PORT, miniCluster.getRestAddress().getPort());
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">提交任务到JobMaster</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">return</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> miniCluster.executeJobBlocking(jobGraph);
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">finally</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
transformations.clear();
miniCluster.close();
}
}</span></pre><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div></div><p style="margin:10px auto;"> </p></div><p style="margin:10px auto;">这个方法里有一部分逻辑是与生成图结构相关的,我们放在第二章里讲;现在我们先接着往里跟:</p><div class="md-section-divider" style="margin:0px;padding:0px;"><div class="cnblogs_code" style="margin:5px 0px;padding:5px;background-color:rgb(245,245,245);border:1px solid rgb(204,204,204);font-family:'Courier New';font-size:12px;"><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div><pre style="margin-bottom:10px;padding:10px 15px;white-space:pre-wrap;font-size:14px;color:rgb(51,51,51);background:rgba(102,128,153,.05) none repeat scroll 0px 0px;border:0px solid rgba(0,0,0,.15);font-family:'Courier New';"><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">MiniCluster.java</span>
public JobExecutionResult executeJobBlocking(JobGraph job) throws JobExecutionException, InterruptedException {
checkNotNull(job, "job is null");
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">在这里,最终把job提交给了jobMaster</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span> CompletableFuture<JobSubmissionResult> submissionFuture =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> submitJob(job);
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span> CompletableFuture<JobResult> jobResultFuture =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> submissionFuture.thenCompose(
(JobSubmissionResult ignored) </span>-><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> requestJobResult(job.getJobID()));
......
}</span></pre><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div></div><p style="margin:10px auto;"> </p></div><p style="margin:10px auto;">正如我在注释里写的,这一段代码核心逻辑就是调用那个<code style="margin:0px;padding:2px 4px;font-family:Monaco, Menlo, Consolas, 'Courier New', monospace;font-size:14px;color:rgb(44,62,80);white-space:nowrap;background-color:rgb(214,219,223);border:0px;">submitJob</code>方法。那么我们再接着看这个方法:</p><div class="md-section-divider" style="margin:0px;padding:0px;"><div class="cnblogs_code" style="margin:5px 0px;padding:5px;background-color:rgb(245,245,245);border:1px solid rgb(204,204,204);font-family:'Courier New';font-size:12px;"><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div><pre style="margin-bottom:10px;padding:10px 15px;white-space:pre-wrap;font-size:14px;color:rgb(51,51,51);background:rgba(102,128,153,.05) none repeat scroll 0px 0px;border:0px solid rgba(0,0,0,.15);font-family:'Courier New';"> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">public</span> CompletableFuture<JobSubmissionResult><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> submitJob(JobGraph jobGraph) {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> DispatcherGateway dispatcherGateway;
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">try</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
dispatcherGateway </span>=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> getDispatcherGateway();
} </span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">catch</span> (LeaderRetrievalException |<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> InterruptedException e) {
ExceptionUtils.checkInterrupted(e);
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">return</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> FutureUtils.completedExceptionally(e);
}
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> we have to allow queued scheduling in Flip-6 mode because we need to request slots
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> from the ResourceManager</span>
jobGraph.setAllowQueuedScheduling(<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">true</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">);
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span> CompletableFuture<Void> jarUploadFuture =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> uploadAndSetJarFiles(dispatcherGateway, jobGraph);
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span> CompletableFuture<Acknowledge> acknowledgeCompletableFuture =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> jarUploadFuture.thenCompose(
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">在这里执行了真正的submit操作</span>
(Void ack) -><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> dispatcherGateway.submitJob(jobGraph, rpcTimeout));
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">return</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> acknowledgeCompletableFuture.thenApply(
(Acknowledge ignored) </span>-> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> JobSubmissionResult(jobGraph.getJobID()));
}</span></pre><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div></div><p style="margin:10px auto;"> </p></div><p style="margin:10px auto;">这里的<code style="margin:0px;padding:2px 4px;font-family:Monaco, Menlo, Consolas, 'Courier New', monospace;font-size:14px;color:rgb(44,62,80);white-space:nowrap;background-color:rgb(214,219,223);border:0px;">Dispatcher</code>是一个接收job,然后指派JobMaster去启动任务的类,我们可以看看它的类结构,有两个实现。在本地环境下启动的是<code style="margin:0px;padding:2px 4px;font-family:Monaco, Menlo, Consolas, 'Courier New', monospace;font-size:14px;color:rgb(44,62,80);white-space:nowrap;background-color:rgb(214,219,223);border:0px;">MiniDispatcher</code>,在集群上提交任务时,集群上启动的是<code style="margin:0px;padding:2px 4px;font-family:Monaco, Menlo, Consolas, 'Courier New', monospace;font-size:14px;color:rgb(44,62,80);white-space:nowrap;background-color:rgb(214,219,223);border:0px;">StandaloneDispatcher</code>。 <br style="margin:0px;padding:0px;"><img title="" src="http://static.zybuluo.com/bethunebtj/y9hjeinc58dqc7wiepv2iim4/image_1cenfj3p9fp110p0a8unn1mrh9.png" alt="image_1cenfj3p9fp110p0a8unn1mrh9.png-27.4kB" style="padding:0px;border:0px;height:auto;max-width:900px;vertical-align:middle;"></p><p style="margin:10px auto;">那么这个Dispatcher又做了什么呢?它启动了一个<code style="margin:0px;padding:2px 4px;font-family:Monaco, Menlo, Consolas, 'Courier New', monospace;font-size:14px;color:rgb(44,62,80);white-space:nowrap;background-color:rgb(214,219,223);border:0px;">JobManagerRunner</code>(这里我要吐槽Flink的命名,这个东西应该叫做JobMasterRunner才对,flink里的JobMaster和JobManager不是一个东西),委托JobManagerRunner去启动该Job的<code style="margin:0px;padding:2px 
4px;font-family:Monaco, Menlo, Consolas, 'Courier New', monospace;font-size:14px;color:rgb(44,62,80);white-space:nowrap;background-color:rgb(214,219,223);border:0px;">JobMaster</code>。我们看一下对应的代码:</p><div class="md-section-divider" style="margin:0px;padding:0px;"><div class="cnblogs_code" style="margin:5px 0px;padding:5px;background-color:rgb(245,245,245);border:1px solid rgb(204,204,204);font-family:'Courier New';font-size:12px;"><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div><pre style="margin-bottom:10px;padding:10px 15px;white-space:pre-wrap;font-size:14px;color:rgb(51,51,51);background:rgba(102,128,153,.05) none repeat scroll 0px 0px;border:0px solid rgba(0,0,0,.15);font-family:'Courier New';"><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">jobManagerRunner.java</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">private</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">void</span> verifyJobSchedulingStatusAndStartJobManager(UUID leaderSessionId) <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">throws</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> Exception {
......
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span> CompletableFuture<Acknowledge> startFuture = jobMaster.start(<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> JobMasterId(leaderSessionId), rpcTimeout);
......
}</span></pre><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div></div><p style="margin:10px auto;"> </p></div><p style="margin:10px auto;">然后,JobMaster经过了一堆方法嵌套之后,执行到了这里:</p><div class="md-section-divider" style="margin:0px;padding:0px;"><div class="cnblogs_code" style="margin:5px 0px;padding:5px;background-color:rgb(245,245,245);border:1px solid rgb(204,204,204);font-family:'Courier New';font-size:12px;"><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div><pre style="margin-bottom:10px;padding:10px 15px;white-space:pre-wrap;font-size:14px;color:rgb(51,51,51);background:rgba(102,128,153,.05) none repeat scroll 0px 0px;border:0px solid rgba(0,0,0,.15);font-family:'Courier New';"> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">private</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">void</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> scheduleExecutionGraph() {
checkState(jobStatusListener </span>== <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">null</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">);
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> register self as job status change listener</span>
jobStatusListener = <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> JobManagerJobStatusListener();
executionGraph.registerJobStatusListener(jobStatusListener);
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">try</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">这里调用了ExecutionGraph的启动方法</span>
executionGraph.scheduleForExecution();
} catch (Throwable t) {
executionGraph.failGlobal(t);
}
}
我们知道,flink的框架里有三层图结构,其中ExecutionGraph就是真正被执行的那一层,所以到这里为止,一个任务从提交到真正执行的流程就走完了,我们再回顾一下(顺便提一下远程提交时的流程区别):
客户端代码的execute方法执行;
本地环境下,MiniCluster完成了大部分任务,直接把任务委派给了MiniDispatcher;
远程环境下,启动了一个RestClusterClient,这个类会以HTTP Rest的方式把用户代码提交到集群上;
远程环境下,请求发到集群上之后,必然有个handler去处理,在这里是JobSubmitHandler。这个类接手了请求后,委派StandaloneDispatcher启动job,到这里之后,本地提交和远程提交的逻辑往后又统一了;
Dispatcher接手job之后,会实例化一个JobManagerRunner,然后用这个runner启动job;
JobManagerRunner接下来把job交给了JobMaster去处理;
JobMaster使用ExecutionGraph的方法启动了整个执行图;整个任务就启动起来了。
至此,第一部分就讲完了。
2.理解flink的图结构
第一部分讲到,我们的主函数最后一项任务就是生成StreamGraph,然后生成JobGraph,然后以此开始调度任务运行,所以接下来我们从这里入手,继续探索flink。
2.1 flink的三层图结构
事实上,flink总共提供了三种图的抽象,我们前面已经提到了StreamGraph和JobGraph,还有一种是ExecutionGraph,是用于调度的基本数据结构。
上面这张图清晰地给出了flink各个图的工作原理和转换过程。其中最后一个物理执行图并非flink的数据结构,而是程序开始执行后,各个task分布在不同的节点上,所形成的物理上的关系表示。
从JobGraph的图里可以看到,数据从上一个operator流到下一个operator的过程中,上游作为生产者提供了IntermediateDataSet,而下游作为消费者需要JobEdge。事实上,JobEdge是一个通信管道,连接了上游生产的dataset和下游的JobVertex节点。 在JobGraph转换到ExecutionGraph的过程中,主要发生了以下转变:
加入了并行度的概念,成为真正可调度的图结构;
生成了与JobVertex对应的ExecutionJobVertex、ExecutionVertex,与IntermediateDataSet对应的IntermediateResult和IntermediateResultPartition等,并行将通过这些类实现。
ExecutionGraph已经可以用于调度任务。我们可以看到,flink根据该图生成了一一对应的Task,每个Task对应ExecutionGraph中的一个Execution。Task用InputGate、InputChannel和ResultPartition对应了上面图中的IntermediateResult和ExecutionEdge。
那么,flink抽象出这三层图结构,四层执行逻辑的意义是什么呢? StreamGraph是对用户逻辑的映射。JobGraph在此基础上进行了一些优化,比如把一部分操作串成chain以提高效率。ExecutionGraph是为了调度存在的,加入了并行处理的概念。而在此基础上真正执行的是Task及其相关结构。
2.2 StreamGraph的生成
在第一节的算子注册部分,我们可以看到,flink把每一个算子transform成一个对流的转换(比如上文中返回的SingleOutputStreamOperator是一个DataStream的子类),并且注册到执行环境中,用于生成StreamGraph。实际生成StreamGraph的入口是StreamGraphGenerator.generate(env, transformations),其中的transformations是一个list,里面记录的就是我们在transform方法中放进来的算子。
2.2.1 StreamTransformation类代表了流的转换
StreamTransformation代表了从一个或多个DataStream生成新DataStream的操作。顺便,DataStream类在内部组合了一个StreamTransformation类,实际的转换操作均通过该类完成。 我们可以看到,从source到各种map,union再到sink操作全部被映射成了StreamTransformation。 其映射过程如下所示:
以MapFunction为例:
2.2.2 StreamGraph生成函数分析
我们从StreamGraphGenerator.generate()方法往下看:
public static StreamGraph generate(StreamExecutionEnvironment env, List<StreamTransformation<?>> transformations) {
return new StreamGraphGenerator(env).generateInternal(transformations);
}
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">注意,StreamGraph的生成是从sink开始的</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">private</span> StreamGraph generateInternal(List<StreamTransformation<?>><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> transformations) {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">for</span> (StreamTransformation<?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> transformation: transformations) {
transform(transformation);
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">return</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> streamGraph;
}
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">这个方法的核心逻辑就是判断传入的StreamTransformation是哪种类型,并执行相应的操作,详情见下面那一大堆if-else</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">private</span> Collection<Integer> transform(StreamTransformation<?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> transform) {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> (alreadyTransformed.containsKey(transform)) {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">return</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> alreadyTransformed.get(transform);
}
LOG.debug(</span>"Transforming " +<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> transform);
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (transform.getMaxParallelism() <= 0<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> if the max parallelism hasn't been set, then first use the job wide max parallelism
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> from the ExecutionConfig.</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">int</span> globalMaxParallelismFromConfig =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> env.getConfig().getMaxParallelism();
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (globalMaxParallelismFromConfig > 0<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
transform.setMaxParallelism(globalMaxParallelismFromConfig);
}
}
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> call at least once to trigger exceptions about MissingTypeInfo</span>
transform.getOutputType();
Collection</span><Integer><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> transformedIds;
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">这里对操作符的类型进行判断,并以此调用相应的处理逻辑.简而言之,处理的核心无非是递归的将该节点和节点的上游节点加入图</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (transform <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">instanceof</span> OneInputTransformation<?, ?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
transformedIds </span>= transformOneInputTransform((OneInputTransformation<?, ?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) transform);
} </span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">else</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (transform <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">instanceof</span> TwoInputTransformation<?, ?, ?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
transformedIds </span>= transformTwoInputTransform((TwoInputTransformation<?, ?, ?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) transform);
} </span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">else</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (transform <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">instanceof</span> SourceTransformation<?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
transformedIds </span>= transformSource((SourceTransformation<?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) transform);
} </span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">else</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (transform <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">instanceof</span> SinkTransformation<?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
transformedIds </span>= transformSink((SinkTransformation<?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) transform);
} </span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">else</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (transform <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">instanceof</span> UnionTransformation<?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
transformedIds </span>= transformUnion((UnionTransformation<?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) transform);
} </span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">else</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (transform <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">instanceof</span> SplitTransformation<?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
transformedIds </span>= transformSplit((SplitTransformation<?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) transform);
} </span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">else</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (transform <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">instanceof</span> SelectTransformation<?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
transformedIds </span>= transformSelect((SelectTransformation<?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) transform);
} </span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">else</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (transform <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">instanceof</span> FeedbackTransformation<?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
transformedIds </span>= transformFeedback((FeedbackTransformation<?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) transform);
} </span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">else</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (transform <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">instanceof</span> CoFeedbackTransformation<?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
transformedIds </span>= transformCoFeedback((CoFeedbackTransformation<?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) transform);
} </span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">else</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (transform <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">instanceof</span> PartitionTransformation<?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
transformedIds </span>= transformPartition((PartitionTransformation<?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) transform);
} </span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">else</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (transform <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">instanceof</span> SideOutputTransformation<?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
transformedIds </span>= transformSideOutput((SideOutputTransformation<?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) transform);
} </span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">else</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">throw</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> IllegalStateException("Unknown transformation: " +<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> transform);
}
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">注意这里和函数开始时的方法相对应,在有向图中要注意避免循环的产生
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> need this check because the iterate transformation adds itself before
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> transforming the feedback edges</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (!<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">alreadyTransformed.containsKey(transform)) {
alreadyTransformed.put(transform, transformedIds);
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (transform.getBufferTimeout() > 0<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
streamGraph.setBufferTimeout(transform.getId(), transform.getBufferTimeout());
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (transform.getUid() != <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">null</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
streamGraph.setTransformationUID(transform.getId(), transform.getUid());
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (transform.getUserProvidedNodeHash() != <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">null</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
streamGraph.setTransformationUserHash(transform.getId(), transform.getUserProvidedNodeHash());
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (transform.getMinResources() != <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">null</span> && transform.getPreferredResources() != <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">null</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
streamGraph.setResources(transform.getId(), transform.getMinResources(), transform.getPreferredResources());
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">return</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> transformedIds;
}</span></pre><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div></div><p style="margin:10px auto;"> </p></div><p style="margin:10px auto;">因为map,filter等常用操作都是OneInputStreamOperator,我们就来看看<code style="margin:0px;padding:2px 4px;font-family:Monaco, Menlo, Consolas, 'Courier New', monospace;font-size:14px;color:rgb(44,62,80);white-space:nowrap;background-color:rgb(214,219,223);border:0px;">transformOneInputTransform((OneInputTransformation<?, ?>) transform)</code>方法。</p><div class="md-section-divider" style="margin:0px;padding:0px;"><div class="cnblogs_code" style="margin:5px 0px;padding:5px;background-color:rgb(245,245,245);border:1px solid rgb(204,204,204);font-family:'Courier New';font-size:12px;"><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div><pre style="margin-bottom:10px;padding:10px 15px;white-space:pre-wrap;font-size:14px;color:rgb(51,51,51);background:rgba(102,128,153,.05) none repeat scroll 0px 0px;border:0px solid rgba(0,0,0,.15);font-family:'Courier New';"><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">private</span> <IN, OUT> Collection<Integer> transformOneInputTransform(OneInputTransformation<IN, OUT><span 
style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> transform) {
Collection</span><Integer> inputIds =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> transform(transform.getInput());
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> 在递归处理节点过程中,某个节点可能已经被其他子节点先处理过了,需要跳过</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> (alreadyTransformed.containsKey(transform)) {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">return</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> alreadyTransformed.get(transform);
}
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">这里是获取slotSharingGroup。这个group用来定义当前我们在处理的这个操作符可以跟什么操作符chain到一个slot里进行操作
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">因为有时候我们可能不满意flink替我们做的chain聚合
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">一个slot就是一个执行task的基本容器</span>
String slotSharingGroup =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> determineSlotSharingGroup(transform.getSlotSharingGroup(), inputIds);
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">把该operator加入图</span>
streamGraph.addOperator(transform.getId(), slotSharingGroup, transform.getOperator(), transform.getInputType(), transform.getOutputType(), transform.getName());
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">对于keyedStream,我们还要记录它的keySelector方法
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">flink并不真正为每个keyedStream保存一个key,而是每次需要用到key的时候都使用keySelector方法进行计算
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">因此,我们自定义的keySelector方法需要保证幂等性
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">到后面介绍keyGroup的时候我们还会再次提到这一点</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (transform.getStateKeySelector() != <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">null</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
TypeSerializer</span><?> keySerializer =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> transform.getStateKeyType().createSerializer(env.getConfig());
streamGraph.setOneInputStateKey(transform.getId(), transform.getStateKeySelector(), keySerializer);
}
streamGraph.setParallelism(transform.getId(), transform.getParallelism());
streamGraph.setMaxParallelism(transform.getId(), transform.getMaxParallelism());
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">为当前节点和它的依赖节点建立边
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">这里可以看到之前提到的select union partition等逻辑节点被合并入edge的过程</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">for</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> (Integer inputId: inputIds) {
streamGraph.addEdge(inputId, transform.getId(), </span>0<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">);
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">return</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> Collections.singleton(transform.getId());
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">public</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">void</span> addEdge(Integer upStreamVertexID, Integer downStreamVertexID, <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">int</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> typeNumber) {
addEdgeInternal(upStreamVertexID,
downStreamVertexID,
typeNumber,
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">null</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">,
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> ArrayList<String><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">(),
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">null</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">);
}
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">addEdge的实现,会合并一些逻辑节点</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">private</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">void</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> addEdgeInternal(Integer upStreamVertexID,
Integer downStreamVertexID,
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">int</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> typeNumber,
StreamPartitioner</span><?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> partitioner,
List</span><String><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> outputNames,
OutputTag outputTag) {
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">如果输入边是侧输出节点,则把side的输入边作为本节点的输入边,并递归调用</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> (virtualSideOutputNodes.containsKey(upStreamVertexID)) {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">int</span> virtualId =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> upStreamVertexID;
upStreamVertexID </span>=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> virtualSideOutputNodes.get(virtualId).f0;
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (outputTag == <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">null</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
outputTag </span>=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> virtualSideOutputNodes.get(virtualId).f1;
}
addEdgeInternal(upStreamVertexID, downStreamVertexID, typeNumber, partitioner, </span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">null</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">, outputTag);
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">如果输入边是select,则把select的输入边作为本节点的输入边</span>
} <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">else</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> (virtualSelectNodes.containsKey(upStreamVertexID)) {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">int</span> virtualId =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> upStreamVertexID;
upStreamVertexID </span>=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> virtualSelectNodes.get(virtualId).f0;
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> (outputNames.isEmpty()) {
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> selections that happen downstream override earlier selections</span>
outputNames =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> virtualSelectNodes.get(virtualId).f1;
}
addEdgeInternal(upStreamVertexID, downStreamVertexID, typeNumber, partitioner, outputNames, outputTag);
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">如果是partition节点</span>
} <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">else</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> (virtualPartitionNodes.containsKey(upStreamVertexID)) {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">int</span> virtualId =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> upStreamVertexID;
upStreamVertexID </span>=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> virtualPartitionNodes.get(virtualId).f0;
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (partitioner == <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">null</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
partitioner </span>=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> virtualPartitionNodes.get(virtualId).f1;
}
addEdgeInternal(upStreamVertexID, downStreamVertexID, typeNumber, partitioner, outputNames, outputTag);
} </span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">else</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">正常的edge处理逻辑</span>
StreamNode upstreamNode =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> getStreamNode(upStreamVertexID);
StreamNode downstreamNode </span>=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> getStreamNode(downStreamVertexID);
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> If no partitioner was specified and the parallelism of upstream and downstream
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> operator matches use forward partitioning, use rebalance otherwise.</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (partitioner == <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">null</span> && upstreamNode.getParallelism() ==<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> downstreamNode.getParallelism()) {
partitioner </span>= <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> ForwardPartitioner<Object><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">();
} </span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">else</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (partitioner == <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">null</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
partitioner </span>= <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> RebalancePartitioner<Object><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">();
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (partitioner <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">instanceof</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> ForwardPartitioner) {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (upstreamNode.getParallelism() !=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> downstreamNode.getParallelism()) {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">throw</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> UnsupportedOperationException("Forward partitioning does not allow " +
"change of parallelism. Upstream operation: " + upstreamNode + " parallelism: " + upstreamNode.getParallelism() +
", downstream operation: " + downstreamNode + " parallelism: " + downstreamNode.getParallelism() +
" You must use another partitioning strategy, such as broadcast, rebalance, shuffle or global."<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">);
}
}
StreamEdge edge </span>= <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> StreamEdge(upstreamNode, downstreamNode, typeNumber, outputNames, partitioner, outputTag);
getStreamNode(edge.getSourceId()).addOutEdge(edge);
getStreamNode(edge.getTargetId()).addInEdge(edge);
}
}</span></pre><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div></div><p style="margin:10px auto;"> </p></div><div class="md-section-divider" style="margin:0px;padding:0px;"> </div><h4 style="margin-top:10px;margin-bottom:10px;padding:0px;font-size:14px;font-family:inherit;line-height:20px;color:rgb(51,51,51);">2.2.3 WordCount函数的StreamGraph</h4><p style="margin:10px auto;">flink提供了一个StreamGraph可视化显示工具,<a href="http://flink.apache.org/visualizer/" rel="nofollow" style="margin:0px;padding:0px;color:#000000;" target="_blank">在这里</a> <br style="margin:0px;padding:0px;">我们可以把我们的程序的执行计划打印出来<code style="margin:0px;padding:2px 4px;font-family:Monaco, Menlo, Consolas, 'Courier New', monospace;font-size:14px;color:rgb(44,62,80);white-space:nowrap;background-color:rgb(214,219,223);border:0px;">System.out.println(env.getExecutionPlan());</code> 复制到这个网站上,点击生成,如图所示: <br style="margin:0px;padding:0px;"><img title="" src="http://static.zybuluo.com/bethunebtj/sfckex3xgu33m3srk2bc5hgk/image_1cafgsliu1n2n1uj21p971b0h6m71t.png" alt="image_1cafgsliu1n2n1uj21p971b0h6m71t.png-25.7kB" style="padding:0px;border:0px;height:auto;max-width:900px;vertical-align:middle;"><br style="margin:0px;padding:0px;">可以看到,我们源程序被转化成了4个operator。 <br style="margin:0px;padding:0px;">另外,在operator之间的连线上也显示出了flink添加的一些逻辑流程。由于我设定了每个操作符的并行度都是1,所以在每个操作符之间都是直接FORWARD,不存在shuffle的过程。</p><div class="md-section-divider" style="margin:0px;padding:0px;"> </div><h3 style="margin-top:10px;margin-bottom:10px;padding:0px;font-size:16px;font-family:inherit;line-height:1.5;color:inherit;"><a name="t9"></a>2.3 JobGraph的生成</h3><p 
style="margin:10px auto;">flink会根据上一步生成的StreamGraph生成JobGraph,然后将JobGraph发送到server端进行ExecutionGraph的解析。</p><div class="md-section-divider" style="margin:0px;padding:0px;"> </div><h4 style="margin-top:10px;margin-bottom:10px;padding:0px;font-size:14px;font-family:inherit;line-height:20px;color:rgb(51,51,51);">2.3.1 JobGraph生成源码</h4><p style="margin:10px auto;">与StreamGraph类似,JobGraph的入口方法是<code style="margin:0px;padding:2px 4px;font-family:Monaco, Menlo, Consolas, 'Courier New', monospace;font-size:14px;color:rgb(44,62,80);white-space:nowrap;background-color:rgb(214,219,223);border:0px;">StreamingJobGraphGenerator.createJobGraph()</code>。我们直接来看源码</p><div class="md-section-divider" style="margin:0px;padding:0px;"><div class="cnblogs_code" style="margin:5px 0px;padding:5px;background-color:rgb(245,245,245);border:1px solid rgb(204,204,204);font-family:'Courier New';font-size:12px;"><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div><pre style="margin-bottom:10px;padding:10px 15px;white-space:pre-wrap;font-size:14px;color:rgb(51,51,51);background:rgba(102,128,153,.05) none repeat scroll 0px 0px;border:0px solid rgba(0,0,0,.15);font-family:'Courier New';"><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">private</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> JobGraph createJobGraph() {
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> 设置启动模式为所有节点均在一开始就启动</span>
jobGraph.setScheduleMode(ScheduleMode.EAGER);
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> 为每个节点生成hash id</span>
Map<Integer, <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">byte</span>[]> hashes =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> defaultStreamGraphHasher.traverseStreamGraphAndGenerateHashes(streamGraph);
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> 为了保持兼容性创建的hash</span>
List<Map<Integer, <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">byte</span>[]>> legacyHashes = <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> ArrayList<><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">(legacyStreamGraphHashers.size());
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">for</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> (StreamGraphHasher hasher : legacyStreamGraphHashers) {
legacyHashes.add(hasher.traverseStreamGraphAndGenerateHashes(streamGraph));
}
Map</span><Integer, List<Tuple2<<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">byte</span>[], <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">byte</span>[]>>> chainedOperatorHashes = <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> HashMap<><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">();
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">生成jobvertex,串成chain等
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">这里的逻辑大致可以理解为,挨个遍历节点,如果该节点是一个chain的头节点,就生成一个JobVertex,如果不是头节点,就要把自身配置并入头节点,然后把头节点和自己的出边相连;对于不能chain的节点,当作只有头节点处理即可</span>
setChaining(hashes, legacyHashes, chainedOperatorHashes);
//设置输入边edge
setPhysicalEdges();
//设置slot共享group
setSlotSharing();
//配置检查点
configureCheckpointing();
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> 如果有之前的缓存文件的配置的话,重新读入</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">for</span> (Tuple2<String, DistributedCache.DistributedCacheEntry><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> e : streamGraph.getEnvironment().getCachedFiles()) {
DistributedCache.writeFileInfoToConfig(e.f0, e.f1, jobGraph.getJobConfiguration());
}
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> 传递执行环境配置</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">try</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
jobGraph.setExecutionConfig(streamGraph.getExecutionConfig());
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">catch</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> (IOException e) {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">throw</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> IllegalConfigurationException("Could not serialize the ExecutionConfig." +
"This indicates that non-serializable types (like custom serializers) were registered"<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">);
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">return</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> jobGraph;
}</span></pre><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div></div><p style="margin:10px auto;"> </p></div><h4 style="margin-top:10px;margin-bottom:10px;padding:0px;font-size:14px;font-family:inherit;line-height:20px;color:rgb(51,51,51);">2.3.2 operator chain的逻辑</h4><blockquote class="white-blockquote" style="margin-bottom:20px;padding:10px 15px;background:rgba(102,128,153,.05) none repeat scroll 0px 0px;border-width:2px 2px 2px 10px;border-style:solid;border-color:rgb(214,219,223);line-height:1.6;color:rgb(51,51,51);"><p style="margin:10px auto;line-height:25px;">为了更高效地分布式执行,Flink会尽可能地将operator的subtask链接(chain)在一起形成task。每个task在一个线程中执行。将operators链接成task是非常有效的优化:它能减少线程之间的切换,减少消息的序列化/反序列化,减少数据在缓冲区的交换,减少了延迟的同时提高整体的吞吐量。</p></blockquote><p style="margin:10px auto;"><img title="" src="http://static.zybuluo.com/bethunebtj/jcjalvv130ex52vkglkt56r2/image_1cafj7s6bittk5tt0bequlig2a.png" alt="image_1cafj7s6bittk5tt0bequlig2a.png-158.7kB" style="padding:0px;border:0px;height:auto;max-width:900px;vertical-align:middle;"><br style="margin:0px;padding:0px;">上图中将KeyAggregation和Sink两个operator进行了合并,因为这两个合并后并不会改变整体的拓扑结构。但是,并不是任意两个 operator 就能 chain 一起的,其条件还是很苛刻的:</p><blockquote class="white-blockquote" style="margin-bottom:20px;padding:10px 15px;background:rgba(102,128,153,.05) none repeat scroll 0px 0px;border-width:2px 2px 2px 10px;border-style:solid;border-color:rgb(214,219,223);line-height:1.6;color:rgb(51,51,51);"><ul style="margin-bottom:10px;margin-left:25px;"><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">上下游的并行度一致</li><li 
style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">下游节点的入度为1 (也就是说下游节点没有来自其他节点的输入)</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">上下游节点都在同一个 slot group 中(下面会解释 slot group)</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">下游节点的 chain 策略为 ALWAYS(可以与上下游链接,map、flatmap、filter等默认是ALWAYS)</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">上游节点的 chain 策略为 ALWAYS 或 HEAD(只能与下游链接,不能与上游链接,Source默认是HEAD)</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">两个节点间数据分区方式是 forward(参考理解数据流的分区)</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">用户没有禁用 chain</li></ul></blockquote><p style="margin:10px auto;">flink的chain逻辑是一种很常见的设计,比如spring的interceptor也是类似的实现方式。通过把操作符串成一个大操作符,flink避免了把数据序列化后通过网络发送给其他节点的开销,能够大大增强效率。</p><div class="md-section-divider" style="margin:0px;padding:0px;"> </div><h4 style="margin-top:10px;margin-bottom:10px;padding:0px;font-size:14px;font-family:inherit;line-height:20px;color:rgb(51,51,51);">2.3.3 JobGraph的提交</h4><p style="margin:10px auto;">前面已经提到,JobGraph的提交依赖于JobClient和JobManager之间的异步通信,如图所示: <br style="margin:0px;padding:0px;"><img title="" src="http://static.zybuluo.com/bethunebtj/dj015uuqpnb4ct7810qfilhe/image_1cafn516r1p68kt31g7r196rcsv2n.png" alt="image_1cafn516r1p68kt31g7r196rcsv2n.png-40.1kB" style="padding:0px;border:0px;height:auto;max-width:900px;vertical-align:middle;"><br style="margin:0px;padding:0px;">在submitJobAndWait方法中,其首先会创建一个JobClientActor的ActorRef,然后向其发起一个SubmitJobAndWait消息,该消息将JobGraph的实例提交给JobClientActor。发起模式是ask,它表示需要一个应答消息。</p><div class="md-section-divider" style="margin:0px;padding:0px;"><div class="cnblogs_code" style="margin:5px 0px;padding:5px;background-color:rgb(245,245,245);border:1px solid rgb(204,204,204);font-family:'Courier New';font-size:12px;"><pre 
style="margin-bottom:10px;padding:10px 15px;white-space:pre-wrap;font-size:14px;color:rgb(51,51,51);background:rgba(102,128,153,.05) none repeat scroll 0px 0px;border:0px solid rgba(0,0,0,.15);font-family:'Courier New';">Future<Object> future = Patterns.ask(jobClientActor, <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> JobClientMessages.SubmitJobAndWait(jobGraph), <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> Timeout(AkkaUtils.INF_TIMEOUT()));
answer = Await.result(future, AkkaUtils.INF_TIMEOUT());
该SubmitJobAndWait消息被JobClientActor接收后,最终通过调用tryToSubmitJob方法触发真正的提交动作。当JobManager的actor接收到来自client端的请求后,会执行一个submitJob方法,主要做以下事情:
向BlobLibraryCacheManager注册该Job; 构建ExecutionGraph对象; 对JobGraph中的每个顶点进行初始化; 将DAG拓扑从source开始排序,并将排序后的顶点集合附加到ExecutionGraph对象; 获取检查点相关的配置,并将其设置到ExecutionGraph对象; 向ExecutionGraph注册相关的listener; 执行恢复操作或者将JobGraph信息写入SubmittedJobGraphStore以在后续用于恢复目的; 响应给客户端JobSubmitSuccess消息; 对ExecutionGraph对象进行调度执行;
最后,JobManager会返回消息给JobClient,通知该任务是否提交成功。
2.4 ExecutionGraph的生成
与StreamGraph和JobGraph不同,ExecutionGraph并不是在我们的客户端程序生成,而是在服务端(JobManager处)生成的。顺便一提,flink只维护一个JobManager。其入口代码是ExecutionGraphBuilder.buildGraph(…)
该方法长200多行,其中一大半是checkpoint的相关逻辑,我们暂且略过,直接看核心方法executionGraph.attachJobGraph(sortedTopology)
因为ExecutionGraph事实上只是改动了JobGraph的每个节点,而没有对整个拓扑结构进行变动,所以代码里只是挨个遍历jobVertex并进行处理:
for (JobVertex jobVertex : topologiallySorted) {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (jobVertex.isInputVertex() && !<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">jobVertex.isStoppable()) {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">this</span>.isStoppable = <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">false</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">;
}
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">在这里生成ExecutionGraph的每个节点
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">首先是进行了一堆赋值,将任务信息交给要生成的图节点,以及设定并行度等等
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">然后是创建本节点的IntermediateResult,根据本节点的下游节点的个数确定创建几份
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">最后是根据设定好的并行度创建用于执行task的ExecutionVertex
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">如果job有设定inputsplit的话,这里还要指定inputsplits</span>
ExecutionJobVertex ejv = <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> ExecutionJobVertex(
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">this</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">,
jobVertex,
</span>1<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">,
rpcCallTimeout,
globalModVersion,
createTimestamp);
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">这里要处理所有的JobEdge
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">对每个edge,获取对应的intermediateResult,并记录到本节点的输入上
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">最后,把每个ExecutionVertex和对应的IntermediateResult关联起来</span>
ejv.connectToPredecessors(<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">this</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">.intermediateResults);
ExecutionJobVertex previousTask </span>= <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">this</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">.tasks.putIfAbsent(jobVertex.getID(), ejv);
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (previousTask != <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">null</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">throw</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> JobException(String.format("Encountered two job vertices with ID %s : previous=[%s] / new=[%s]"<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">,
jobVertex.getID(), ejv, previousTask));
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">for</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> (IntermediateResult res : ejv.getProducedDataSets()) {
IntermediateResult previousDataSet </span>= <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">this</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">.intermediateResults.putIfAbsent(res.getId(), res);
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (previousDataSet != <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">null</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">throw</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> JobException(String.format("Encountered two intermediate data set with ID %s : previous=[%s] / new=[%s]"<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">,
res.getId(), res, previousDataSet));
}
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">this</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">.verticesInCreationOrder.add(ejv);
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">this</span>.numVerticesTotal +=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> ejv.getParallelism();
newExecJobVertices.add(ejv);
}</span></pre><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div></div><p style="margin:10px auto;"> </p></div><p style="margin:10px auto;">至此,ExecutorGraph就创建完成了。</p><div class="md-section-divider" style="margin:0px;padding:0px;"> </div><h2 style="margin-top:10px;margin-bottom:10px;padding:0px;font-size:21px;font-family:inherit;line-height:1.5;color:inherit;"><a name="t11"></a>3. 任务的调度与执行</h2><p style="margin:10px auto;">关于flink的任务执行架构,官网的这两张图就是最好的说明: <br style="margin:0px;padding:0px;"><img src="http://static.zybuluo.com/bethunebtj/qiv2wip1rok62ljo0tef3qf0/image_1cafnu1pl1d8c15m219b8vkb2334.png" alt="image_1cafnu1pl1d8c15m219b8vkb2334.png-112.9kB" style="padding:0px;border:0px;height:auto;max-width:900px;vertical-align:middle;"><br style="margin:0px;padding:0px;">Flink 集群启动后,首先会启动一个 JobManger 和多个的 TaskManager。用户的代码会由JobClient 提交给 JobManager,JobManager 再把来自不同用户的任务发给 不同的TaskManager 去执行,每个TaskManager管理着多个task,task是执行计算的最小结构, TaskManager 将心跳和统计信息汇报给 JobManager。TaskManager 之间以流的形式进行数据的传输。上述除了task外的三者均为独立的 JVM 进程。 <br style="margin:0px;padding:0px;">要注意的是,TaskManager和job并非一一对应的关系。flink调度的最小单元是task而非TaskManager,也就是说,来自不同job的不同task可能运行于同一个TaskManager的不同线程上。 <br style="margin:0px;padding:0px;"><img title="" src="http://static.zybuluo.com/bethunebtj/b7cmjn41b1zp5sco34kgvusn/image_1cclle7ui2j41nf611gs1is18m19.png" alt="image_1cclle7ui2j41nf611gs1is18m19.png-127.5kB" style="padding:0px;border:0px;height:auto;max-width:900px;vertical-align:middle;"><br style="margin:0px;padding:0px;">一个flink任务所有可能的状态如上图所示。图上画的很明白,就不再赘述了。</p><div class="md-section-divider" style="margin:0px;padding:0px;"> 
</div><h3 style="margin-top:10px;margin-bottom:10px;padding:0px;font-size:16px;font-family:inherit;line-height:1.5;color:inherit;"><a name="t12"></a>3.1 计算资源的调度</h3><p style="margin:10px auto;">Task slot是一个TaskManager内资源分配的最小载体,代表了一个固定大小的资源子集,每个TaskManager会将其所占有的资源平分给它的slot。 <br style="margin:0px;padding:0px;">通过调整 task slot 的数量,用户可以定义task之间是如何相互隔离的。每个 TaskManager 有一个slot,也就意味着每个task运行在独立的 JVM 中。每个 TaskManager 有多个slot的话,也就是说多个task运行在同一个JVM中。 <br style="margin:0px;padding:0px;">而在同一个JVM进程中的task,可以共享TCP连接(基于多路复用)和心跳消息,可以减少数据的网络传输,也能共享一些数据结构,一定程度上减少了每个task的消耗。 <br style="margin:0px;padding:0px;">每个slot可以接受单个task,也可以接受多个连续task组成的pipeline,如下图所示,FlatMap函数占用一个taskslot,而key Agg函数和sink函数共用一个taskslot: <br style="margin:0px;padding:0px;"><img title="" src="http://static.zybuluo.com/bethunebtj/6ypu9v09z0mit936uk0mcddi/image_1cafpf21c1jh3s5ap1fisu4v23h.png" alt="image_1cafpf21c1jh3s5ap1fisu4v23h.png-44.7kB" style="padding:0px;border:0px;height:auto;max-width:900px;vertical-align:middle;"><br style="margin:0px;padding:0px;">为了达到共用slot的目的,除了可以以chain的方式pipeline算子,我们还可以允许SlotSharingGroup,如下图所示: <br style="margin:0px;padding:0px;"><img title="" src="http://static.zybuluo.com/bethunebtj/tgamd7vw9qcdttvihlmvhie9/image_1cafpko68b3r1lk0dpsnmbj3c3u.png" alt="image_1cafpko68b3r1lk0dpsnmbj3c3u.png-61.2kB" style="padding:0px;border:0px;height:auto;max-width:900px;vertical-align:middle;"><br style="margin:0px;padding:0px;">我们可以把不能被chain成一条的两个操作如flatmap和key&sink放在一个TaskSlot里执行,这样做可以获得以下好处:</p><ul style="margin-bottom:10px;margin-left:30px;"><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">共用slot使得我们不再需要计算每个任务需要的总task数目,直接取最高算子的并行度即可</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">对计算资源的利用率更高。例如,通常的轻量级操作map和重量级操作Aggregate不再分别需要一个线程,而是可以在同一个线程内执行,而且对于slot有限的场景,我们可以增大每个task的并行度了。 <br style="margin:0px;padding:0px;">接下来我们还是用官网的图来说明flink是如何重用slot的: <br style="margin:0px;padding:0px;"><img title="" 
src="http://static.zybuluo.com/bethunebtj/l0n9ny2y198x0daucmyo0zb4/image_1cafqroarkjkuje1hfi18gor654b.png" alt="image_1cafqroarkjkuje1hfi18gor654b.png-137kB" style="margin-bottom:0px;padding:0px;border:0px;height:auto;max-width:900px;vertical-align:middle;"><br style="margin:0px;padding:0px;"><ol style="margin-bottom:0px;margin-left:25px;padding-left:40px;"><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;list-style:disc;line-height:27px;">TaskManager1分配一个SharedSlot0</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;list-style:disc;line-height:27px;">把source task放入一个SimpleSlot0,再把该slot放入SharedSlot0</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;list-style:disc;line-height:27px;">把flatmap task放入一个SimpleSlot1,再把该slot放入SharedSlot0</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;list-style:disc;line-height:27px;">因为我们的flatmap task并行度是2,因此不能再放入SharedSlot0,所以向TaskMange21申请了一个新的SharedSlot0</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;list-style:disc;line-height:27px;">把第二个flatmap task放进一个新的SimpleSlot,并放进TaskManager2的SharedSlot0</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;list-style:disc;line-height:27px;">开始处理key&sink task,因为其并行度也是2,所以先把第一个task放进TaskManager1的SharedSlot</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;list-style:disc;line-height:27px;">把第二个key&sink放进TaskManager2的SharedSlot</li></ol></li></ul><div class="md-section-divider" style="margin:0px;padding:0px;"> </div><h3 style="margin-top:10px;margin-bottom:10px;padding:0px;font-size:16px;font-family:inherit;line-height:1.5;color:inherit;"><a name="t13"></a>3.2 JobManager执行job</h3><p style="margin:10px auto;">JobManager负责接收 flink 的作业,调度 task,收集 job 的状态、管理 TaskManagers。被实现为一个 akka actor。</p><div class="md-section-divider" style="margin:0px;padding:0px;"> </div><h4 
style="margin-top:10px;margin-bottom:10px;padding:0px;font-size:14px;font-family:inherit;line-height:20px;color:rgb(51,51,51);">3.2.1 JobManager的组件</h4><ul style="margin-bottom:10px;margin-left:30px;"><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">BlobServer 是一个用来管理二进制大文件的服务,比如保存用户上传的jar文件,该服务会将其写到磁盘上。还有一些相关的类,如BlobCache,用于TaskManager向JobManager下载用户的jar文件</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">InstanceManager 用来管理当前存活的TaskManager的组件,记录了TaskManager的心跳信息等</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">CompletedCheckpointStore 用于保存已完成的checkpoint相关信息,持久化到内存中或者zookeeper上</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">MemoryArchivist 保存了已经提交到flink的作业的相关信息,如JobGraph等</li></ul><div class="md-section-divider" style="margin:0px;padding:0px;"> </div><h4 style="margin-top:10px;margin-bottom:10px;padding:0px;font-size:14px;font-family:inherit;line-height:20px;color:rgb(51,51,51);">3.2.2 JobManager的启动过程</h4><p style="margin:10px auto;">先列出JobManager启动的核心代码</p><div class="md-section-divider" style="margin:0px;padding:0px;"><div class="cnblogs_code" style="margin:5px 0px;padding:5px;background-color:rgb(245,245,245);border:1px solid rgb(204,204,204);font-family:'Courier New';font-size:12px;"><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div><pre style="margin-bottom:10px;padding:10px 15px;white-space:pre-wrap;font-size:14px;color:rgb(51,51,51);background:rgba(102,128,153,.05) none repeat scroll 0px 
0px;border:0px solid rgba(0,0,0,.15);font-family:'Courier New';"><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">def runJobManager(
configuration: Configuration,
executionMode: JobManagerMode,
listeningAddress: String,
listeningPort: Int)
: Unit </span>=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
val numberProcessors </span>=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> Hardware.getNumberCPUCores()
val futureExecutor </span>=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> Executors.newScheduledThreadPool(
numberProcessors,
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> ExecutorThreadFactory("jobmanager-future"<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">))
val ioExecutor </span>=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> Executors.newFixedThreadPool(
numberProcessors,
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> ExecutorThreadFactory("jobmanager-io"<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">))
val timeout </span>=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> AkkaUtils.getTimeout(configuration)
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> we have to first start the JobManager ActorSystem because this determines the port if 0
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> was chosen before. The method startActorSystem will update the configuration correspondingly.</span>
val jobManagerSystem =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> startActorSystem(
configuration,
listeningAddress,
listeningPort)
val highAvailabilityServices </span>=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> HighAvailabilityServicesUtils.createHighAvailabilityServices(
configuration,
ioExecutor,
AddressResolution.NO_ADDRESS_RESOLUTION)
val metricRegistry </span>= <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> MetricRegistryImpl(
MetricRegistryConfiguration.fromConfiguration(configuration))
metricRegistry.startQueryService(jobManagerSystem, </span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">null</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">)
val (_, _, webMonitorOption, _) </span>= <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">try</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
startJobManagerActors(
jobManagerSystem,
configuration,
executionMode,
listeningAddress,
futureExecutor,
ioExecutor,
highAvailabilityServices,
metricRegistry,
classOf[JobManager],
classOf[MemoryArchivist],
Option(classOf[StandaloneResourceManager])
)
} </span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">catch</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">case</span> t: Throwable =><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">
futureExecutor.shutdownNow()
ioExecutor.shutdownNow()
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">throw</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> t
}
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> block until everything is shut down</span>
jobManagerSystem.awaitTermination()
.......
}
- 配置Akka并生成ActorSystem,启动JobManager
- 启动HA和metric相关服务
- 在startJobManagerActors()方法中启动JobManagerActors,以及webserver,TaskManagerActor,ResourceManager等等
- 阻塞等待终止
- 集群通过LeaderService等选出JobManager的leader
3.2.3 JobManager启动Task
JobManager 是一个Actor,通过各种消息来完成核心逻辑:
override def handleMessage: Receive = {
case GrantLeadership(newLeaderSessionID) =>
log.info(s"JobManager $getAddress was granted leadership with leader session ID " +
s"$newLeaderSessionID.")
leaderSessionID = newLeaderSessionID
.......</span></pre><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div></div><p style="margin:10px auto;"> </p></div><p style="margin:10px auto;">有几个比较重要的消息:</p><ul style="margin-bottom:10px;margin-left:30px;"><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">GrantLeadership 获得leader授权,将自身被分发到的 session id 写到 zookeeper,并恢复所有的 jobs</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">RevokeLeadership 剥夺leader授权,打断清空所有的 job 信息,但是保留作业缓存,注销所有的 TaskManagers</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">RegisterTaskManagers 注册 TaskManager,如果之前已经注册过,则只给对应的 Instance 发送消息,否则启动注册逻辑:在 InstanceManager 中注册该 Instance 的信息,并停止 Instance BlobLibraryCacheManager 的端口【供下载 lib 包用】,同时使用 watch 监听 task manager 的存活</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">SubmitJob 提交 jobGraph <br style="margin:0px;padding:0px;">最后一项SubmintJob就是我们要关注的,从客户端收到JobGraph,转换为ExecutionGraph并执行的过程。</li></ul><div class="md-section-divider" style="margin:0px;padding:0px;"><div class="cnblogs_code" style="margin:5px 0px;padding:5px;background-color:rgb(245,245,245);border:1px solid rgb(204,204,204);font-family:'Courier New';font-size:12px;"><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" 
alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div><pre style="margin-bottom:10px;padding:10px 15px;white-space:pre-wrap;font-size:14px;color:rgb(51,51,51);background:rgba(102,128,153,.05) none repeat scroll 0px 0px;border:0px solid rgba(0,0,0,.15);font-family:'Courier New';"><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">private</span> def submitJob(jobGraph: JobGraph, jobInfo: JobInfo, isRecovery: Boolean = <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">false</span>): Unit =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
......
executionGraph </span>=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> ExecutionGraphBuilder.buildGraph(
executionGraph,
jobGraph,
flinkConfiguration,
futureExecutor,
ioExecutor,
scheduler,
userCodeLoader,
checkpointRecoveryFactory,
Time.of(timeout.length, timeout.unit),
restartStrategy,
jobMetrics,
numSlots,
blobServer,
log.logger)
......
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> (leaderElectionService.hasLeadership) {
log.info(s</span>"Scheduling job $jobId ($jobName)."<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">)
executionGraph.scheduleForExecution()
} </span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">else</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
self </span>! decorateMessage(RemoveJob(jobId, removeJobFromStateBackend = <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">false</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">))
log.warn(s</span>"Submitted job $jobId, but not leader. The other leader needs to recover " +
"this. I am not scheduling the job for execution."<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">)
......
}
首先做一些准备工作,然后获取一个ExecutionGraph,判断是否是恢复的job,然后将job保存下来,并且通知客户端本地已经提交成功了,最后如果确认本JobManager是leader,则执行executionGraph.scheduleForExecution()方法,这个方法经过一系列调用,把每个ExecutionVertex传递给了Execution类的deploy方法:
public void deploy() throws JobException {
......
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">try</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> good, we are allowed to deploy</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (!slot.setExecutedVertex(<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">this</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">)) {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">throw</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> JobException("Could not assign the ExecutionVertex to the slot " +<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> slot);
}
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> race double check, did we fail/cancel and do we need to release the slot?</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">this</span>.state !=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> DEPLOYING) {
slot.releaseSlot();
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">return</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">;
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> (LOG.isInfoEnabled()) {
LOG.info(String.format(</span>"Deploying %s (attempt #%d) to %s"<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">, vertex.getTaskNameWithSubtaskIndex(),
attemptNumber, getAssignedResourceLocation().getHostname()));
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span> TaskDeploymentDescriptor deployment =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> vertex.createDeploymentDescriptor(
attemptId,
slot,
taskState,
attemptNumber);
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span> TaskManagerGateway taskManagerGateway =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> slot.getTaskManagerGateway();
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span> CompletableFuture<Acknowledge> submitResultFuture =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> taskManagerGateway.submitTask(deployment, timeout);
......
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">catch</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> (Throwable t) {
markFailed(t);
ExceptionUtils.rethrow(t);
}
}</span></pre><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div></div><p style="margin:10px auto;"> </p></div><p style="margin:10px auto;">我们首先生成了一个TaskDeploymentDescriptor,然后交给了<code style="margin:0px;padding:2px 4px;font-family:Monaco, Menlo, Consolas, 'Courier New', monospace;font-size:14px;color:rgb(44,62,80);white-space:nowrap;background-color:rgb(214,219,223);border:0px;">taskManagerGateway.submitTask()</code>方法执行。接下来的部分,就属于TaskManager的范畴了。</p><div class="md-section-divider" style="margin:0px;padding:0px;"> </div><h3 style="margin-top:10px;margin-bottom:10px;padding:0px;font-size:16px;font-family:inherit;line-height:1.5;color:inherit;"><a name="t14"></a>3.3 TaskManager执行task</h3><div class="md-section-divider" style="margin:0px;padding:0px;"> </div><h4 style="margin-top:10px;margin-bottom:10px;padding:0px;font-size:14px;font-family:inherit;line-height:20px;color:rgb(51,51,51);">3.3.1 TaskManager的基本组件</h4><p style="margin:10px auto;">TaskManager是flink中资源管理的基本组件,是所有执行任务的基本容器,提供了内存管理、IO管理、通信管理等一系列功能,本节对各个模块进行简要介绍。 <br style="margin:0px;padding:0px;">1. MemoryManager flink并没有把所有内存的管理都委托给JVM,因为JVM普遍存在着存储对象密度低、大内存时GC对系统影响大等问题。所以flink自己抽象了一套内存管理机制,将所有对象序列化后放在自己的MemorySegment上进行管理。MemoryManger涉及内容较多,将在后续章节进行继续剖析。 <br style="margin:0px;padding:0px;">2. 
IOManager flink通过IOManager管理磁盘IO的过程,提供了同步和异步两种写模式,又进一步区分了block、buffer和bulk三种读写方式。 <br style="margin:0px;padding:0px;">IOManager提供了两种方式枚举磁盘文件,一种是直接遍历文件夹下所有文件,另一种是计数器方式,对每个文件名以递增顺序访问。 <br style="margin:0px;padding:0px;">在底层,flink将文件IO抽象为FileIOChannle,封装了底层实现。 <br style="margin:0px;padding:0px;"><img title="" src="http://static.zybuluo.com/bethunebtj/d3j6qnbjywjzknu6pb3pou6i/image_1cag7idg4vfj1l871n0l1k0e1f7u4o.png" alt="image_1cag7idg4vfj1l871n0l1k0e1f7u4o.png-194.1kB" style="padding:0px;border:0px;height:auto;max-width:900px;vertical-align:middle;"><br style="margin:0px;padding:0px;">可以看到,flink在底层实际上都是以异步的方式进行读写。 <br style="margin:0px;padding:0px;">3. NetworkEnvironment 是TaskManager的网络 IO 组件,包含了追踪中间结果和数据交换的数据结构。它的构造器会统一将配置的内存先分配出来,抽象成 NetworkBufferPool 统一管理内存的申请和释放。意思是说,在输入和输出数据时,不管是保留在本地内存,等待chain在一起的下个操作符进行处理,还是通过网络把本操作符的计算结果发送出去,都被抽象成了NetworkBufferPool。后续我们还将对这个组件进行详细分析。</p><div class="md-section-divider" style="margin:0px;padding:0px;"> </div><h4 style="margin-top:10px;margin-bottom:10px;padding:0px;font-size:14px;font-family:inherit;line-height:20px;color:rgb(51,51,51);">3.3.2 TaskManager执行Task</h4><p style="margin:10px auto;">对于TM来说,执行task就是把收到的<code style="margin:0px;padding:2px 4px;font-family:Monaco, Menlo, Consolas, 'Courier New', monospace;font-size:14px;color:rgb(44,62,80);white-space:nowrap;background-color:rgb(214,219,223);border:0px;">TaskDeploymentDescriptor</code>对象转换成一个task并执行的过程。TaskDeploymentDescriptor这个类保存了task执行所必须的所有内容,例如序列化的算子,输入的InputGate和输出的ResultPartition的定义,该task要作为几个subtask执行等等。 <br style="margin:0px;padding:0px;">按照正常逻辑思维,很容易想到TM的submitTask方法的行为:首先是确认资源,如寻找JobManager和Blob,而后建立连接,解序列化算子,收集task相关信息,接下来就是创建一个新的<code style="margin:0px;padding:2px 4px;font-family:Monaco, Menlo, Consolas, 'Courier New', monospace;font-size:14px;color:rgb(44,62,80);white-space:nowrap;background-color:rgb(214,219,223);border:0px;">Task</code>对象,这个task对象就是真正执行任务的关键所在。</p><div class="md-section-divider" style="margin:0px;padding:0px;"><div class="cnblogs_code" 
style="margin:5px 0px;padding:5px;background-color:rgb(245,245,245);border:1px solid rgb(204,204,204);font-family:'Courier New';font-size:12px;"><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div><pre style="margin-bottom:10px;padding:10px 15px;white-space:pre-wrap;font-size:14px;color:rgb(51,51,51);background:rgba(102,128,153,.05) none repeat scroll 0px 0px;border:0px solid rgba(0,0,0,.15);font-family:'Courier New';">val task = <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> Task(
jobInformation,
taskInformation,
tdd.getExecutionAttemptId,
tdd.getAllocationId,
tdd.getSubtaskIndex,
tdd.getAttemptNumber,
tdd.getProducedPartitions,
tdd.getInputGates,
tdd.getTargetSlotNumber,
tdd.getTaskStateHandles,
memoryManager,
ioManager,
network,
bcVarManager,
taskManagerConnection,
inputSplitProvider,
checkpointResponder,
blobCache,
libCache,
fileCache,
config,
taskMetricGroup,
resultPartitionConsumableNotifier,
partitionStateChecker,
context.dispatcher)</span></pre><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div></div><p style="margin:10px auto;"> </p></div><p style="margin:10px auto;">如果读者是从头开始看这篇blog,里面有很多对象应该已经比较明确其作用了(除了那个brVarManager,这个是管理广播变量的,广播变量是一类会被分发到每个任务中的共享变量)。接下来的主要任务,就是把这个task启动起来,然后报告说已经启动task了:</p><div class="md-section-divider" style="margin:0px;padding:0px;"><div class="cnblogs_code" style="margin:5px 0px;padding:5px;background-color:rgb(245,245,245);border:1px solid rgb(204,204,204);font-family:'Courier New';font-size:12px;"><pre style="margin-bottom:10px;padding:10px 15px;white-space:pre-wrap;font-size:14px;color:rgb(51,51,51);background:rgba(102,128,153,.05) none repeat scroll 0px 0px;border:0px solid rgba(0,0,0,.15);font-family:'Courier New';"><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> all good, we kick off the task, which performs its own initialization</span>
task.startTaskThread()
sender ! decorateMessage(Acknowledge.get())
3.3.2.1 生成Task对象
在执行new Task()方法时,第一步是把构造函数里的这些变量赋值给当前task的fields。 接下来是初始化ResultPartition和InputGate。这两个类描述了task的输出数据和输入数据。
for (ResultPartitionDeploymentDescriptor desc: resultPartitionDeploymentDescriptors) {
ResultPartitionID partitionId = new ResultPartitionID(desc.getPartitionId(), executionId);
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">this</span>.producedPartitions[counter] = <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> ResultPartition(
taskNameWithSubtaskAndId,
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">this</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">,
jobId,
partitionId,
desc.getPartitionType(),
desc.getNumberOfSubpartitions(),
desc.getMaxParallelism(),
networkEnvironment.getResultPartitionManager(),
resultPartitionConsumableNotifier,
ioManager,
desc.sendScheduleOrUpdateConsumersMessage());
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">为每个partition初始化对应的writer </span>
writers[counter] = <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> ResultPartitionWriter(producedPartitions[counter]);
</span>++<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">counter;
}
// Consumed intermediate result partitions
this.inputGates = new SingleInputGate[inputGateDeploymentDescriptors.size()];
this.inputGatesById = new HashMap<>();
counter = 0;
for (InputGateDeploymentDescriptor inputGateDeploymentDescriptor: inputGateDeploymentDescriptors) { SingleInputGate gate = SingleInputGate.create( taskNameWithSubtaskAndId, jobId, executionId, inputGateDeploymentDescriptor, networkEnvironment, this, metricGroup.getIOMetricGroup());
inputGates[counter] </span>=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> gate;
inputGatesById.put(gate.getConsumedResultId(), gate);
</span>++<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">counter;
}
最后,创建一个Thread对象,并把自己放进该对象,这样在执行时,自己就有了自身的线程的引用。
3.3.2.2 运行Task对象
Task对象本身就是一个Runnable,因此在其run方法里定义了运行逻辑。 第一步是切换Task的状态:
while (true) {
ExecutionState current = this.executionState;
//如果当前的执行状态为CREATED,则将其设置为DEPLOYING状态
if (current == ExecutionState.CREATED) {
if (transitionState(ExecutionState.CREATED, ExecutionState.DEPLOYING)) {
// success, we can start our work
break;
}
}
//如果当前执行状态为FAILED,则发出通知并退出run方法
else if (current == ExecutionState.FAILED) {
// we were immediately failed. tell the TaskManager that we reached our final state
notifyFinalState();
if (metrics != null) {
metrics.close();
}
return;
}
//如果当前执行状态为CANCELING,则将其修改为CANCELED状态,并退出run
else if (current == ExecutionState.CANCELING) {
if (transitionState(ExecutionState.CANCELING, ExecutionState.CANCELED)) {
// we were immediately canceled. tell the TaskManager that we reached our final state
notifyFinalState();
if (metrics != null) {
metrics.close();
}
return;
}
}
//否则说明发生了异常
else {
if (metrics != null) {
metrics.close();
}
throw new IllegalStateException("Invalid state for beginning of operation of task " + this + '.');
}
}
接下来,就是导入用户类加载器并加载用户代码。 然后,是向网络管理器注册当前任务(flink的各个算子在运行时进行数据交换需要依赖网络管理器),分配一些缓存以保存数据。 然后,读入指定的缓存文件。 然后,再把task创建时传入的那一大堆变量用于创建一个执行环境Environment。 再然后,对于那些并不是第一次执行的task(比如失败后重启的)要恢复其状态。 接下来最重要的是
invokable.invoke();
方法。为什么这么说呢,因为这个方法就是用户代码所真正被执行的入口。比如我们写的什么new MapFunction()的逻辑,最终就是在这里被执行的。这里说一下这个invokable,这是一个抽象类,提供了可以被TaskManager执行的对象的基本抽象。 这个invokable是在解析JobGraph的时候生成相关信息的,并在此处形成真正可执行的对象
// now load the task's invokable code
//通过反射生成对象
invokable = loadAndInstantiateInvokable(userCodeClassLoader, nameOfInvokableClass);
上图显示了flink提供的可被执行的Task类型。从名字上就可以看出各个task的作用,在此不再赘述。 接下来就是invoke方法了,因为我们的wordcount例子用了流式api,在此我们以StreamTask的invoke方法为例进行说明。
3.3.2.3 StreamTask的执行逻辑
先上部分核心代码:
public final void invoke() throws Exception {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">boolean</span> disposed = <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">false</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">;
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">try</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> -------- Initialize ---------
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">先做一些赋值操作</span>
…
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> if the clock is not already set, then assign a default TimeServiceProvider
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">处理timer</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (timerService == <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">null</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
ThreadFactory timerThreadFactory </span>=
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> DispatcherThreadFactory(TRIGGER_THREAD_GROUP, "Time Trigger for " +<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> getName());
timerService </span>= <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> SystemProcessingTimeService(<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">this</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">, getCheckpointLock(), timerThreadFactory);
}
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">把之前JobGraph串起来的chain的信息形成实现</span>
operatorChain = <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> OperatorChain<>(<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">this</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">);
headOperator </span>=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> operatorChain.getHeadOperator();
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> task specific initialization
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">这个init操作的起名非常诡异,因为这里主要是处理算子采用了自定义的checkpoint检查机制的情况,但是起了一个非常大众脸的名字</span>
init();
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> save the work of reloading state, etc, if the task is already canceled</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> (canceled) {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">throw</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> CancelTaskException();
}
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> -------- Invoke --------</span>
LOG.debug("Invoking {}"<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">, getName());
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> we need to make sure that any triggers scheduled in open() cannot be
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> executed before all operators are opened</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">synchronized</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> (lock) {
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> both the following operations are protected by the lock
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> so that we avoid race conditions in the case that initializeState()
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> registers a timer, that fires before the open() is called.
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">初始化操作符状态,主要是一些state啥的</span>
initializeState();
//对于富操作符,执行其open操作
openAllOperators();
}
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> final check to exit early before starting to run</span>
if (canceled) { throw new CancelTaskException(); }
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> let the task do its work
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">真正开始执行的代码</span>
isRunning = <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">true</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">;
run();</span></pre><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div></div><p style="margin:10px auto;"> </p></div><p style="margin:10px auto;">StreamTask.invoke()方法里,第一个值得一说的是<code style="margin:0px;padding:2px 4px;font-family:Monaco, Menlo, Consolas, 'Courier New', monospace;font-size:14px;color:rgb(44,62,80);white-space:nowrap;background-color:rgb(214,219,223);border:0px;">TimerService</code>。Flink在2015年决定向StreamTask类加入timer service的时候解释到:</p><blockquote class="white-blockquote" style="margin-bottom:20px;padding:10px 15px;background:rgba(102,128,153,.05) none repeat scroll 0px 0px;border-width:2px 2px 2px 10px;border-style:solid;border-color:rgb(214,219,223);line-height:1.6;color:rgb(51,51,51);"><p style="margin:10px auto;line-height:25px;">This integrates the timer as a service in StreamTask that StreamOperators can use by calling a method on the StreamingRuntimeContext. This also ensures that the timer callbacks can not be called concurrently with other methods on the StreamOperator. 
This behaviour is ensured by an ITCase.</p></blockquote><p style="margin:10px auto;">第二个要注意的是chain操作。前面提到了,flink会出于优化的角度,把一些算子chain成一个整体的算子作为一个task来执行。比如wordcount例子中,Source和FlatMap算子就被chain在了一起。在进行chain操作的时候,会设定头节点,并且指定输出的RecordWriter。</p><p style="margin:10px auto;">接下来不出所料仍然是初始化,只不过初始化的对象变成了各个operator。如果是有checkpoint的,那就从state信息里恢复,不然就作为全新的算子处理。从源码中可以看到,flink针对keyed算子和普通算子做了不同的处理。keyed算子在初始化时需要计算出一个group区间,这个区间的值在整个生命周期里都不会再变化,后面key就会根据hash的不同结果,分配到特定的group中去计算。顺便提一句,flink的keyed算子保存的是对每个数据的key的计算方法,而非真实的key,用户需要自己保证对每一行数据提供的keySelector的幂等性。至于为什么要用KeyGroup的设计,这就牵扯到扩容的范畴了,将在后面的章节进行讲述。 <br style="margin:0px;padding:0px;">对于<code style="margin:0px;padding:2px 4px;font-family:Monaco, Menlo, Consolas, 'Courier New', monospace;font-size:14px;color:rgb(44,62,80);white-space:nowrap;background-color:rgb(214,219,223);border:0px;">openAllOperators()</code>方法,就是对各种RichOperator执行其open方法,通常可用于在执行计算之前加载资源。 <br style="margin:0px;padding:0px;">最后,run方法千呼万唤始出来,该方法经过一系列跳转,最终调用chain上的第一个算子的run方法。在wordcount的例子中,它最终调用了SocketTextStreamFunction的run,建立socket连接并读入文本。</p><div class="md-section-divider" style="margin:0px;padding:0px;"> </div><h3 style="margin-top:10px;margin-bottom:10px;padding:0px;font-size:16px;font-family:inherit;line-height:1.5;color:inherit;"><a name="t15"></a>3.4 StreamTask与StreamOperator</h3><p style="margin:10px auto;">前面提到,Task对象在执行过程中,把执行的任务交给了StreamTask这个类去执行。在我们的wordcount例子中,实际初始化的是OneInputStreamTask的对象(参考上面的类图)。那么这个对象是如何执行用户的代码的呢?</p><div class="md-section-divider" style="margin:0px;padding:0px;"><div class="cnblogs_code" style="margin:5px 0px;padding:5px;background-color:rgb(245,245,245);border:1px solid rgb(204,204,204);font-family:'Courier New';font-size:12px;"><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img 
src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div><pre style="margin-bottom:10px;padding:10px 15px;white-space:pre-wrap;font-size:14px;color:rgb(51,51,51);background:rgba(102,128,153,.05) none repeat scroll 0px 0px;border:0px solid rgba(0,0,0,.15);font-family:'Courier New';"> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">protected</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">void</span> run() <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">throws</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> Exception {
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> cache processor reference on the stack, to make the code more JIT friendly</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span> StreamInputProcessor<IN> inputProcessor = <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">this</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">.inputProcessor;
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">while</span> (running &&<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> inputProcessor.processInput()) {
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> all the work happens in the "processInput" method</span>
} }
它做的,就是把任务直接交给了InputProcessor去执行processInput方法。这是一个StreamInputProcessor的实例,该processor的任务就是处理输入的数据,包括用户数据、watermark和checkpoint数据等。我们先来看看这个processor是如何产生的:
public void init() throws Exception {
StreamConfig configuration = getConfiguration();
TypeSerializer</span><IN> inSerializer =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> configuration.getTypeSerializerIn1(getUserCodeClassLoader());
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">int</span> numberOfInputs =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> configuration.getNumberOfInputs();
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (numberOfInputs > 0<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
InputGate[] inputGates </span>=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> getEnvironment().getAllInputGates();
inputProcessor </span>= <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> StreamInputProcessor<><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">(
inputGates,
inSerializer,
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">this</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">,
configuration.getCheckpointMode(),
getCheckpointLock(),
getEnvironment().getIOManager(),
getEnvironment().getTaskManagerInfo().getConfiguration(),
getStreamStatusMaintainer(),
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">this</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">.headOperator);
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> make sure that stream tasks report their I/O statistics</span>
inputProcessor.setMetricGroup(getEnvironment().getMetricGroup().getIOMetricGroup()); } }
这是OneInputStreamTask的init方法,从configs里面获取StreamOperator信息,生成自己的inputProcessor。那么inputProcessor是如何处理数据的呢?我们接着跟进源码:
public boolean processInput() throws Exception {
if (isFinished) {
return false;
}
if (numRecordsIn == null) {
numRecordsIn = ((OperatorMetricGroup) streamOperator.getMetricGroup()).getIOMetricGroup().getNumRecordsInCounter();
}
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">这个while是用来处理单个元素的(不要想当然以为是循环处理元素的)</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">while</span> (<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">true</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">注意 1在下面
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">2.接下来,会利用这个反序列化器得到下一个数据记录,并进行解析(是用户数据还是watermark等等),然后进行对应的操作</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (currentRecordDeserializer != <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">null</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
DeserializationResult result </span>=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> currentRecordDeserializer.getNextRecord(deserializationDelegate);
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> (result.isBufferConsumed()) {
currentRecordDeserializer.getCurrentBuffer().recycle();
currentRecordDeserializer </span>= <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">null</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">;
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> (result.isFullRecord()) {
StreamElement recordOrMark </span>=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> deserializationDelegate.getInstance();
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">如果元素是watermark,就准备更新当前channel的watermark值(并不是简单赋值,因为有乱序存在),</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> (recordOrMark.isWatermark()) {
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> handle watermark</span>
statusWatermarkValve.inputWatermark(recordOrMark.asWatermark(), currentChannel);
continue;
} else if (recordOrMark.isStreamStatus()) {
//如果元素是status,就进行相应处理。可以看作是一个flag,标志着当前stream接下来即将没有元素输入(idle),或者当前即将由空闲状态转为有元素状态(active)。同时,StreamStatus还对如何处理watermark有影响。通过发送status,上游的operator可以很方便的通知下游当前的数据流的状态。
// handle stream status
statusWatermarkValve.inputStreamStatus(recordOrMark.asStreamStatus(), currentChannel);
continue;
} else if (recordOrMark.isLatencyMarker()) {
//LatencyMarker是用来衡量代码执行时间的。在Source处创建,携带创建时的时间戳,流到Sink时就可以知道经过了多长时间
// handle latency marker
synchronized (lock) {
streamOperator.processLatencyMarker(recordOrMark.asLatencyMarker());
}
continue;
} else {
//这里就是真正的,用户的代码即将被执行的地方。从章节1到这里足足用了三万字,有点万里长征的感觉
// now we can do the actual processing
StreamRecord<IN> record = recordOrMark.asRecord();
synchronized (lock) {
numRecordsIn.inc();
streamOperator.setKeyContextElement1(record);
streamOperator.processElement(record);
}
return true;
}
}
}
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">1.程序首先获取下一个buffer
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">这一段代码是服务于flink的Fault Tolerance(容错)机制的,后面我会讲到,这里只需理解到它会尝试获取buffer,然后赋值给当前的反序列化器</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span> BufferOrEvent bufferOrEvent =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> barrierHandler.getNextNonBlocked();
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (bufferOrEvent != <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">null</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> (bufferOrEvent.isBuffer()) {
currentChannel </span>=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> bufferOrEvent.getChannelIndex();
currentRecordDeserializer </span>=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> recordDeserializers[currentChannel];
currentRecordDeserializer.setNextBuffer(bufferOrEvent.getBuffer());
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">else</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> Event received</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span> AbstractEvent event =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> bufferOrEvent.getEvent();
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (event.getClass() != EndOfPartitionEvent.<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">class</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">throw</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> IOException("Unexpected event: " +<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> event);
}
}
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">else</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
isFinished </span>= <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">true</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">;
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (!<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">barrierHandler.isEmpty()) {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">throw</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> IllegalStateException("Trailing data in checkpoint barrier handler."<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">);
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">return</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">false</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">;
}
}
}</span></pre><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div></div><p style="margin:10px auto;"> </p></div><p style="margin:10px auto;">到此为止,以上部分就是一个flink程序启动后,到执行用户代码之前,flink框架所做的准备工作。回顾一下:</p><ul style="margin-bottom:10px;margin-left:30px;"><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">启动一个环境</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">生成StreamGraph</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">注册和选举JobManager</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">在各节点生成TaskManager,并根据JobGraph生成对应的Task</li><li style="margin-top:0px;margin-bottom:1em;margin-left:0px;padding:0px;line-height:27px;">启动各个task,准备执行代码</li></ul><p style="margin:10px auto;">接下来,我们挑几个Operator看看flink是如何抽象这些算子的。</p><div class="md-section-divider" style="margin:0px;padding:0px;"> </div><h2 style="margin-top:10px;margin-bottom:10px;padding:0px;font-size:21px;font-family:inherit;line-height:1.5;color:inherit;"><a name="t16"></a>4. 
StreamOperator的抽象与实现</h2><div class="md-section-divider" style="margin:0px;padding:0px;"> </div><h3 style="margin-top:10px;margin-bottom:10px;padding:0px;font-size:16px;font-family:inherit;line-height:1.5;color:inherit;"><a name="t17"></a>4.1 数据源的逻辑——StreamSource与时间模型</h3><p style="margin:10px auto;">StreamSource抽象了一个数据源,并且指定了一些如何处理数据的模式。</p><div class="md-section-divider" style="margin:0px;padding:0px;"><div class="cnblogs_code" style="margin:5px 0px;padding:5px;background-color:rgb(245,245,245);border:1px solid rgb(204,204,204);font-family:'Courier New';font-size:12px;"><div class="cnblogs_code_toolbar" style="margin:5px 0px 0px;padding:0px;"><span class="cnblogs_code_copy" style="margin:0px;padding:0px 5px 0px 0px;line-height:1.5;"><a title="复制代码" style="margin:0px;padding:0px;color:#000000;border:none;" target="_blank"><img src="https://img-blog.csdnimg.cn/20181222193246188" alt="复制代码" style="margin-bottom:0px;padding:0px;height:auto;max-width:900px;vertical-align:middle;border:none;"></a></span></div><pre style="margin-bottom:10px;padding:10px 15px;white-space:pre-wrap;font-size:14px;color:rgb(51,51,51);background:rgba(102,128,153,.05) none repeat scroll 0px 0px;border:0px solid rgba(0,0,0,.15);font-family:'Courier New';"><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">public</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">class</span> StreamSource<OUT, SRC <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">extends</span> SourceFunction<OUT>>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">extends</span> AbstractUdfStreamOperator<OUT, SRC> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">implements</span> StreamOperator<OUT><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
......
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">public</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">void</span> run(<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span> Object lockingObject, <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span> StreamStatusMaintainer streamStatusMaintainer) <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">throws</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> Exception {
run(lockingObject, streamStatusMaintainer, output);
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">public</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">void</span> run(<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> Object lockingObject,
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> StreamStatusMaintainer streamStatusMaintainer,
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span> Output<StreamRecord<OUT>> collector) <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">throws</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> Exception {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span> TimeCharacteristic timeCharacteristic =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> getOperatorConfig().getTimeCharacteristic();
LatencyMarksEmitter latencyEmitter </span>= <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">null</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">;
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> (getExecutionConfig().isLatencyTrackingEnabled()) {
latencyEmitter </span>= <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> LatencyMarksEmitter<><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">(
getProcessingTimeService(),
collector,
getExecutionConfig().getLatencyTrackingInterval(),
getOperatorConfig().getVertexID(),
getRuntimeContext().getIndexOfThisSubtask());
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">long</span> watermarkInterval =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> getRuntimeContext().getExecutionConfig().getAutoWatermarkInterval();
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">this</span>.ctx =<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> StreamSourceContexts.getSourceContext(
timeCharacteristic,
getProcessingTimeService(),
lockingObject,
streamStatusMaintainer,
collector,
watermarkInterval,
</span>-1<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">);
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">try</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
userFunction.run(ctx);
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> if we get here, then the user function either exited after being done (finite source)
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> or the function was canceled or stopped. For the finite source case, we should emit
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> a final watermark that indicates that we reached the end of event-time</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">if</span> (!<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">isCanceledOrStopped()) {
ctx.emitWatermark(Watermark.MAX_WATERMARK);
}
} </span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">finally</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> make sure that the context is closed in any case</span>
ctx.close(); if (latencyEmitter != null) { latencyEmitter.close(); } } }
......
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">private</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">static</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">class</span> LatencyMarksEmitter<OUT><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">private</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span> ScheduledFuture<?><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> latencyMarkTimer;
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">public</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> LatencyMarksEmitter(
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> ProcessingTimeService processingTimeService,
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span> Output<StreamRecord<OUT>><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> output,
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">long</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> latencyTrackingInterval,
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">int</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> vertexID,
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">final</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">int</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> subtaskIndex) {
latencyMarkTimer </span>=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> processingTimeService.scheduleAtFixedRate(
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> ProcessingTimeCallback() {
@Override
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">public</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">void</span> onProcessingTime(<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">long</span> timestamp) <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">throws</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> Exception {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">try</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> {
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> ProcessingTimeService callbacks are executed under the checkpointing lock</span>
output.emitLatencyMarker(<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> LatencyMarker(timestamp, vertexID, subtaskIndex));
} </span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">catch</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> (Throwable t) {
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> we catch the Throwables here so that we don't trigger the processing
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;"> timer services async exception handler</span>
LOG.warn("Error while emitting latency marker."<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">, t);
}
}
},
</span>0L<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">,
latencyTrackingInterval);
}
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">public</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">void</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> close() {
latencyMarkTimer.cancel(</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">true</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">);
}
}
}
在StreamSource生成上下文之后,接下来就是把上下文交给SourceFunction去执行:
userFunction.run(ctx);
SourceFunction是对Function的一个抽象,就好像MapFunction、KeySelector一样:用户选择实现这些函数,然后flink框架就能利用这些函数进行计算,完成用户逻辑。我们的wordcount程序使用了flink提供的SocketTextStreamFunction。我们可以看一下它的实现逻辑,对source如何运行有一个基本的认识:
public void run(SourceContext<String> ctx) throws Exception {
final StringBuilder buffer = new StringBuilder();
long attempt = 0;
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">while</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> (isRunning) {
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">try</span> (Socket socket = <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> Socket()) {
currentSocket </span>=<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> socket;
LOG.info(</span>"Connecting to server socket " + hostname + ':' +<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> port);
socket.connect(</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> InetSocketAddress(hostname, port), CONNECTION_TIMEOUT_TIME);
BufferedReader reader </span>= <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> BufferedReader(<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> InputStreamReader(socket.getInputStream()));
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">char</span>[] cbuf = <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">new</span> <span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">char</span>[8192<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">];
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">int</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> bytesRead;
</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">//</span><span style="margin:0px;padding:0px;color:rgb(0,128,0);font-size:12px;line-height:1.5;">核心逻辑就是一直读inputSocket,然后交给collect方法</span>
<span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">while</span> (isRunning && (bytesRead = reader.read(cbuf)) != -1<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">) {
buffer.append(cbuf, </span>0<span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;">, bytesRead);
</span><span style="margin:0px;padding:0px;color:rgb(0,0,255);font-size:12px;line-height:1.5;">int</span><span style="margin:0px;padding:0px;color:rgb(0,0,0);font-size:12px;line-height:1.5;"> delimPos;
</span><span style="margin:0px;padding:0px;color:rgb