1、TaskExecutor 执行一个 Task
TaskExecutor.submitTask
Task task = new Task(
// TODO taskExecutorServices.createShuffleEnvironment
taskExecutorServices.getShuffleEnvironment()
创建ShuffleEnvironment T
TaskManager启动时创建ShuffleEnvironment
startTaskManager-> TaskManagerServices.fromConfiguration ->createShuffleEnvironment
-> NettyShuffleServiceFactory.createShuffleEnvironment -> createNettyShuffleEnvironment(){
NettyConfig nettyConfig = config.nettyConfig();
/*************************************************
* TODO_MA
* 注释: 返回: FileChannelManagerImpl
*/
FileChannelManager fileChannelManager = new FileChannelManagerImpl(config.getTempDirs(), DIR_NAME_PREFIX);
/*************************************************
* TODO_MA
* 注释: 返回: NettyConnectionManager
*/
ConnectionManager connectionManager = nettyConfig != null ?
new NettyConnectionManager(resultPartitionManager, taskEventPublisher, nettyConfig)
:new LocalConnectionManager();
/*************************************************
* TODO_MA
* 注释: 返回: NetworkBufferPool
*/
NetworkBufferPool networkBufferPool = new NetworkBufferPool(config.numNetworkBuffers(), config.networkBufferSize(),
config.networkBuffersPerChannel(), config.getRequestSegmentsTimeout());
registerShuffleMetrics(metricGroup, networkBufferPool);
/*************************************************
* TODO_MA
* 注释: 构建 ResultPartitionFactory
*/
ResultPartitionFactory resultPartitionFactory = new ResultPartitionFactory(resultPartitionManager, fileChannelManager, networkBufferPool,
config.getBlockingSubpartitionType(), config.networkBuffersPerChannel(), config.floatingNetworkBuffersPerGate(),
config.networkBufferSize(), config.isForcePartitionReleaseOnConsumption(), config.isBlockingShuffleCompressionEnabled(),
config.getCompressionCodec(), config.getMaxBuffersPerChannel());
/*************************************************
* TODO_MA
* 注释: 构建 SingleInputGateFactory
*/
SingleInputGateFactory singleInputGateFactory = new SingleInputGateFactory(taskExecutorResourceId, config, connectionManager,
resultPartitionManager, taskEventPublisher, networkBufferPool);
/*************************************************
* TODO_MA
* 注释: NettyShuffleEnvironment
*/
return new NettyShuffleEnvironment(taskExecutorResourceId, config, networkBufferPool, connectionManager, resultPartitionManager,
fileChannelManager, resultPartitionFactory, singleInputGateFactory, ioExecutor);
}
--> NettyConnectionManager
/**
 * Builds the connection manager: creates the Netty server, the Netty client, the Netty buffer
 * pool, the partition request client factory and the Netty protocol.
 *
 * @param partitionProvider null-checked and handed to the {@code NettyProtocol}
 * @param taskEventPublisher null-checked and handed to the {@code NettyProtocol}
 * @param nettyConfig used to configure the server and the client, and to size the buffer pool arenas
 */
public NettyConnectionManager(ResultPartitionProvider partitionProvider, TaskEventPublisher taskEventPublisher, NettyConfig nettyConfig) {
/*************************************************
* TODO
* Note: initialize a NettyServer
*/
this.server = new NettyServer(nettyConfig);
/*************************************************
* TODO
* Note: initialize a NettyClient
*/
this.client = new NettyClient(nettyConfig);
// Buffer pool sized by the number of arenas configured in nettyConfig.
this.bufferPool = new NettyBufferPool(nettyConfig.getNumberOfArenas());
// The factory wraps the client created above.
this.partitionRequestClientFactory = new PartitionRequestClientFactory(client);
// TODO
// Both arguments are null-checked before the protocol is built.
this.nettyProtocol = new NettyProtocol(checkNotNull(partitionProvider), checkNotNull(taskEventPublisher));
}
-------------------------------
NettyServer 绑定handler PartitionRequestServerHandler
NettyClient 绑定handler CreditBasedPartitionRequestClientHandler
/**
 * Defines the channel handlers installed on the server side and on the client side.
 * One message encoder instance is created per protocol object and is placed in both
 * the server and the client handler arrays.
 */
public class NettyProtocol {
// Encoder instance reused by getServerChannelHandlers() and getClientChannelHandlers().
private final NettyMessage.NettyMessageEncoder
messageEncoder = new NettyMessage.NettyMessageEncoder();
private final ResultPartitionProvider partitionProvider;
private final TaskEventPublisher taskEventPublisher;
/**
 * Stores the two collaborators that are later handed to the server request handler.
 *
 * @param partitionProvider passed to the PartitionRequestServerHandler
 * @param taskEventPublisher passed to the PartitionRequestServerHandler
 */
NettyProtocol(ResultPartitionProvider partitionProvider, TaskEventPublisher taskEventPublisher) {
this.partitionProvider = partitionProvider;
this.taskEventPublisher = taskEventPublisher;
}
/**
* Returns the server channel handlers.
*
* <pre>
* +-------------------------------------------------------------------+
* |                      SERVER CHANNEL PIPELINE                      |
* |                                                                   |
* |    +----------+----------+ (3) write  +----------------------+    |
* |    | Queue of queues     +----------->| Message encoder      |    |
* |    +----------+----------+            +-----------+----------+    |
* |              /|\                                 \|/              |
* |               | (2) enqueue                       |               |
* |    +----------+----------+                        |               |
* |    | Request handler     |                        |               |
* |    +----------+----------+                        |               |
* |              /|\                                  |               |
* |               |                                   |               |
* |   +-----------+-----------+                       |               |
* |   | Message+Frame decoder |                       |               |
* |   +-----------+-----------+                       |               |
* |              /|\                                  |               |
* +---------------+-----------------------------------+---------------+
* |               | (1) client request               \|/
* +---------------+-----------------------------------+---------------+
* |               |                                   |               |
* |       [ Socket.read() ]                    [ Socket.write() ]     |
* |                                                                   |
* |  Netty Internal I/O Threads (Transport Implementation)            |
* +-------------------------------------------------------------------+
* </pre>
*
* @return channel handlers
*/
public ChannelHandler[] getServerChannelHandlers() {
PartitionRequestQueue queueOfPartitionQueues = new PartitionRequestQueue();
/**
* TODO
* Handles the PartitionRequest and AddCredit requests that the consumer side
* sends through PartitionRequestClient.
*/
PartitionRequestServerHandler serverHandler = new PartitionRequestServerHandler(
partitionProvider,
taskEventPublisher,
queueOfPartitionQueues);
// Array order: encoder, decoder, request handler, queue of partition queues.
return new ChannelHandler[] {
messageEncoder,
new NettyMessage.NettyMessageDecoder(),
serverHandler,
queueOfPartitionQueues
};
}
/**
* Returns the client channel handlers.
*
* <pre>
*     +-----------+----------+            +----------------------+
*     | Remote input channel |            | request client       |
*     +-----------+----------+            +-----------+----------+
*                 |                                   | (1) write
* +---------------+-----------------------------------+---------------+
* |               |     CLIENT CHANNEL PIPELINE       |               |
* |               |                                  \|/              |
* |    +----------+----------+            +----------------------+    |
* |    | Request handler     +            | Message encoder      |    |
* |    +----------+----------+            +-----------+----------+    |
* |              /|\                                 \|/              |
* |               |                                   |               |
* |    +----------+------------+                      |               |
* |    | Message+Frame decoder |                      |               |
* |    +----------+------------+                      |               |
* |              /|\                                  |               |
* +---------------+-----------------------------------+---------------+
* |               | (3) server response              \|/ (2) client request
* +---------------+-----------------------------------+---------------+
* |               |                                   |               |
* |       [ Socket.read() ]                    [ Socket.write() ]     |
* |                                                                   |
* |  Netty Internal I/O Threads (Transport Implementation)            |
* +-------------------------------------------------------------------+
* </pre>
*
* @return channel handlers
*
* These are the handlers used in the NettyClient.
*/
public ChannelHandler[] getClientChannelHandlers() {
NetworkClientHandler networkClientHandler = new CreditBasedPartitionRequestClientHandler();
// Array order: encoder, decoder delegate (wrapping the client handler), client handler.
return new ChannelHandler[]{
messageEncoder,
new NettyMessageClientDecoderDelegate(networkClientHandler),
networkClientHandler};
}
}
public Task()
1、初始化ResultPartition 和 ResultSubPartition
/*************************************************
* TODO 一个task的执行有输入和输出,关于输出的抽象 ResultPartition 和 ResultSubPartition
* 注释: 初始化 ResultPartitionWriter 具体实现是 ResultPartition
*/
// produced intermediate result partitions
final ResultPartitionWriter[] resultPartitionWriters = shuffleEnvironment
// TODO NettyShuffleEnvironment
.createResultPartitionWriters(taskShuffleContext, resultPartitionDeploymentDescriptors).toArray(new ResultPartitionWriter[]{});
2、初始化InputGate
/*************************************************
* TODO 一个task的执行有输入和输出,关于输入的抽象 InputGate InputChannel(从上游一个task节点拉取数据)
* LocalRecoveredInputChannel 或 RemoteRecoveredInputChannel 本地或远程拉取
* 注释: 初始化 InputGate
*/
// consumed intermediate result partitions
final IndexedInputGate[] gates = shuffleEnvironment
// TODO NettyShuffleEnvironment
.createInputGates(taskShuffleContext, this, inputGateDeploymentDescriptors)
.toArray(new IndexedInputGate[0]);
--> createInputGates -> singleInputGateFactory.create -> createBufferPoolFactory -> createBufferPool
-> internalCreateBufferPool -> new LocalBufferPool(创建一个 LocalBufferPool)
3、包装
/*************************************************
* TODO 包装
* 注释: 对上述生成的 ResultPartition 再根据是否需要发回反馈信息等,进行进一步对象的处理
*/
this.consumableNotifyingPartitionWriters = ConsumableNotifyingResultPartitionWriterDecorator
.decorate(resultPartitionDeploymentDescriptors, resultPartitionWriters, this, jobId, resultPartitionConsumableNotifier);
4、 执行 Task 的线程 实例化
/*************************************************
* TODO
* 注释: 执行 Task 的线程 实例化 ,TaskExecutor的 task.startTaskThread();
* 启动线程 转到 Task 的 run() 方法
*/
// finally, create the executing thread, but do not start it
executingThread = new Thread(TASK_THREADS_GROUP, this, taskNameWithSubtask);
-> Task.run
2. SourceStreamTask 和 StreamTask 初始化
首先需要了解的第一个知识点:在最开始一个 job 提交到 Flink standalone 集群运行的时候,在 client
构建 StreamGraph(顶点是 StreamNode,边是 StreamEdge) 的时候,会根据用户调用的算子生成
的 Transformation 为 StreamGraph 生成 StreamNode,在生成 StreamNode 的时候,会通过
OperatorFactory 执行判断,如果该 StreamOperator 是 StreamSource 的话,就会指定该
StreamTask 的 invokableClass 为 SourceStreamTask, 否则为 (OneInputStreamTask,
TwoInputStreamTask, StreamTask)。核心代码是:
StreamGraph.addOperator(....){
invokableClass = operatorFactory.isStreamSource() ? SourceStreamTask.class :
OneInputStreamTask.class;
}
-> SourceStreamTask
/*************************************************
* TODO
* Note: SourceStreamTask is the very first Task of a Flink job, i.e. the Task that connects to the Source.
* It has a dedicated thread for receiving data: LegacySourceFunctionThread.
*
* This constructor passes a lock-based synchronized action executor to the parent constructor,
* keeps the (non-null) lock, and creates the source thread object without starting it here.
*/
private SourceStreamTask(Environment env, Object lock) throws Exception {
/*************************************************
* TODO
* Note: the executor passed here is a SynchronizedStreamTaskActionExecutor
*/
super(env, null, FatalExitExceptionHandler.INSTANCE, StreamTaskActionExecutor.synchronizedExecutor(lock));
this.lock = Preconditions.checkNotNull(lock);
// TODO_MA note: initialize a thread: LegacySourceFunctionThread
// TODO_MA note: this is the thread the source uses to produce data
// It runs inside the Task and receives data for this SourceStreamTask.
this.sourceThread = new LegacySourceFunctionThread();
}
-> OneInputStreamTask
/**
 * Creates the task by delegating to the parent constructor.
 *
 * @param env the environment handed to the parent constructor
 */
public OneInputStreamTask(Environment env) throws Exception {
// TODO_MA note: call the parent constructor
super(env);
}
// 以上两个super都进入 StreamTask
super -> StreamTask
/*************************************************
* TODO
* Note: the final constructor of StreamTask.
*
* In order, it creates: the StreamConfig, the record writer delegate, the MailboxProcessor
* (with processInput as its default action), the main mailbox executor, the async exception
* handler, the async-operations thread pool, the state backend, the subtask checkpoint
* coordinator, the timer service (a default one if none is supplied) and the channel IO executor.
*/
protected StreamTask(Environment environment, @Nullable TimerService timerService, Thread.UncaughtExceptionHandler uncaughtExceptionHandler,
StreamTaskActionExecutor actionExecutor, TaskMailbox mailbox) throws Exception {
super(environment);
this.configuration = new StreamConfig(getTaskConfiguration());
/*************************************************
* TODO
* Note: creates the RecordWriter; most likely a ChannelSelectorRecordWriter, possibly a BroadcastRecordWriter
* (author's note; the concrete type is decided inside createRecordWriterDelegate).
*
* When a Task actually runs, the real input work is done by a RecordReader (batch)
* and the real output work is done by a RecordWriter.
*
* If the Task being executed is an OperatorChain, it necessarily contains several operators:
* the output of the last operator is a RecordWriterOutput,
* the output of each preceding operator is a ChainingOutput.
*
* T1(ChainingOutput) ---> T2(ChainingOutput) ---> T3(RecordWriterOutput)
*
*
*/
this.recordWriter = createRecordWriterDelegate(configuration, environment);
// TODO_MA note: SynchronizedStreamTaskActionExecutor
this.actionExecutor = Preconditions.checkNotNull(actionExecutor);
/*************************************************
* TODO
* Note: while initializing the StreamTask, the MailboxProcessor is initialized with the StreamTask's
* processInput() method as its first argument.
* 1. For a SourceStreamTask, processInput starts the SourceStreamTask's sourceThread.
* 2. For any other StreamTask, the input is handled by a StreamOneInputProcessor or a StreamTwoInputProcessor, depending on the case.
* -
* Second argument: TaskMailboxImpl
* Third argument: SynchronizedStreamTaskActionExecutor
*
* Every piece of work this Task receives is wrapped as a Mail
* and placed into the MailBox;
* some component must therefore poll the MailBox, take the mails and process them.
*/
this.mailboxProcessor = new MailboxProcessor(this::processInput, mailbox, actionExecutor);
/*************************************************
* TODO
* Note (author's remark, not verifiable from this constructor): once this has run, the
* data-receiving thread of SourceStreamTask is waiting at the point where it receives data.
*/
this.mailboxProcessor.initMetric(environment.getMetricGroup());
this.mainMailboxExecutor = mailboxProcessor.getMainMailboxExecutor(); // TODO_MA note: MailboxExecutorImpl
this.asyncExceptionHandler = new StreamTaskAsyncExceptionHandler(environment);
this.asyncOperationsThreadPool = Executors.newCachedThreadPool(new ExecutorThreadFactory("AsyncOperations", uncaughtExceptionHandler));
/*************************************************
* TODO
* Note: creates the StateBackend.
* The matching StateBackend is created according to the state.backend setting
* (author's summary of the available choices):
* -
* 1. MemoryStateBackend keeps state in the JobManager's memory
* 2. FsStateBackend keeps state in a file system, either local or distributed (HDFS, S3, ...)
* 3. RocksDBStateBackend keeps state in RocksDB
* -
* With the author's configuration, the result is usually an FsStateBackend.
*/
this.stateBackend = createStateBackend();
/*************************************************
* TODO
* Note: initializes the SubtaskCheckpointCoordinatorImpl
*/
this.subtaskCheckpointCoordinator = new SubtaskCheckpointCoordinatorImpl(
/*************************************************
* TODO
* Note: creates the CheckpointStorage
* 1. FsStateBackend = FsCheckpointStorage
*/
stateBackend.createCheckpointStorage(getEnvironment().getJobID()), getName(), actionExecutor, getCancelables(),
getAsyncOperationsThreadPool(), getEnvironment(), this, configuration.isUnalignedCheckpointsEnabled(), this::prepareInputSnapshot);
// TODO_MA note: initialization of the time-semantics service
// TODO_MA note: ProcessingTime, EventTime, IngestionTime
// if the clock is not already set, then assign a default TimeServiceProvider
if(timerService == null) {
ThreadFactory timerThreadFactory = new DispatcherThreadFactory(TRIGGER_THREAD_GROUP, "Time Trigger for " + getName());
this.timerService = new SystemProcessingTimeService(this::handleTimerException, timerThreadFactory);
} else {
this.timerService = timerService;
}
/*************************************************
* TODO
* Note: creates the IO thread pool for channels (a single-thread executor)
*/
this.channelIOExecutor = Executors.newSingleThreadExecutor(new ExecutorThreadFactory("channel-state-unspilling"));
}
其中在 SourceStreamTask 的 processInput() 方法中,主要是启动接收数据的线程
LegacySourceFunctionThread。
当构造方法完毕的时候,LegacySourceFunctionThread 已经初始化好了,但是 headOperator 还是
null,所以,LegacySourceFunctionThread 还未真正启动。
所以当 ExecutionVertex 真正被提交到 TaskExecutor 中运行的时候,被封装的 Execution 对应的 Task
类的启动类 AbstractInvokable 就是在构建 StreamGraph 的时候指定的对应的 invokableClass。所以
1、如果启动 SourceStreamTask,则启动类是:SourceStreamTask
2、如果启动非 SourceStreamTask,则启动类是:StreamTask
SourceStreamTask 的构造过程。核心入口:
Task.run -> doRun()
/**
* 重点 13 步
*/
private void doRun() {
// ----------------------------
// Initial State transition
// ----------------------------
while(true) {
ExecutionState current = this.executionState;
// 第一步 将task状态 由 CREATED 改成:DEPLOYING, 然后退出
if(current == ExecutionState.CREATED) {
if(transitionState(ExecutionState.CREATED, ExecutionState.DEPLOYING)) {
// success, we can start our work
break;
}
} else if(current == ExecutionState.FAILED) {
// we were immediately failed. tell the TaskManager that we reached our final state
notifyFinalState();
if(metrics != null) {
metrics.close();
}
return;
} else if(current == ExecutionState.CANCELING) {
if(transitionState(ExecutionState.CANCELING, ExecutionState.CANCELED)) {
// we were immediately canceled. tell the TaskManager that we reached our final state
notifyFinalState();
if(metrics != null) {
metrics.close();
}
return;
}
} else {
if(metrics != null) {
metrics.close();
}
throw new IllegalStateException("Invalid state for beginning of operation of task " + this + '.');
}
}
// all resource acquisitions and registrations from here on
// need to be undone in the end
Map<String, Future<Path>> distributedCacheEntries = new HashMap<>();
// TODO_MA 注释: 当时在构建 ExecutionGraph 的时候,会帮我们把每一个 ExecutionVertex 的启动类都初始化好,
// TODO_MA 注释: 设置在 ExecutionVertex 里面
// TODO_MA 注释: Slot ===> Task ===> ExecutionVertex ===> 启动类
AbstractInvokable invokable = null;
try {
// ----------------------------
// Task Bootstrap - We periodically
// check for canceling as a shortcut
// ----------------------------
// activate safety net for task thread
LOG.debug("Creating FileSystem stream leak safety net for task {}", this);
FileSystemSafetyNet.initializeSafetyNetForThread();
// first of all, get a user-code classloader
// this may involve downloading the job's JAR files and/or classes
LOG.info("Loading JAR files for task {}.", this);
/*************************************************
* TODO
* 注释: 创建一个用于加载用户代码的类加载器
*/
userCodeClassLoader = createUserCodeClassloader();
/*************************************************
* TODO 第二步、准备 ExecutionConfig
* 注释: 通过反序列化得到 ExecutionConfig,从 ExecutionConfig 中可以得到所有算子相关的信息
*/
final ExecutionConfig executionConfig = serializedExecutionConfig.deserializeValue(userCodeClassLoader);
if(executionConfig.getTaskCancellationInterval() >= 0) {
// override task cancellation interval from Flink config if set in ExecutionConfig
taskCancellationInterval = executionConfig.getTaskCancellationInterval();
}
if(executionConfig.getTaskCancellationTimeout() >= 0) {
// override task cancellation timeout from Flink config if set in ExecutionConfig
taskCancellationTimeout = executionConfig.getTaskCancellationTimeout();
}
if(isCanceledOrFailed()) {
throw new CancelTaskException();
}
// ----------------------------------------------------------------
// register the task with the network stack
// this operation may fail if the system does not have enough
// memory to run the necessary data exchanges
// the registration must also strictly be undone
// ----------------------------------------------------------------
LOG.info("Registering task at network: {}.", this);
/*************************************************
* TODO 第三步 : 注册输入和输出组件 启动 ResultPartitionWriter 和 InputGate
*
* 注释: 启动 ResultPartitionWriter 和 InputGate
* 向网络栈中注册 Task,为 ResultPartition 和 InputGate 分配缓冲池
* 原来在初始化 Task 的时候,就已经把 ResultPartition 和 InputGate 给初始化
* 原来在构造 Task 对象的时候,关于输入 和 输出的抽象对象,都已经创建完毕
* 其实就是初始化 BufferPool
*/
setupPartitionsAndGates(consumableNotifyingPartitionWriters, inputGates);
/**
* 第四步 注册 ResultPartitionWriter 到 taskEventDispatcher
*
*/
for(ResultPartitionWriter partitionWriter : consumableNotifyingPartitionWriters) {
taskEventDispatcher.registerPartition(partitionWriter.getPartitionId());
}
// next, kick off the background copying of files for the distributed cache
/**
* 第五步: DistributedCache.readFileInfoFromConfig
* 从分布式缓存中 ,拷贝下来一些运行 Task 所需要的资源文件
*/
try {
for(Map.Entry<String, DistributedCache.DistributedCacheEntry> entry : DistributedCache.readFileInfoFromConfig(jobConfiguration)) {
LOG.info("Obtaining local cache file for '{}'.", entry.getKey());
Future<Path> cp = fileCache.createTmpFile(entry.getKey(), entry.getValue(), jobId, executionId);
distributedCacheEntries.put(entry.getKey(), cp);
}
} catch(Exception e) {
throw new Exception(String.format("Exception while adding files to distributed cache of task %s (%s).", taskNameWithSubtask, executionId), e);
}
if(isCanceledOrFailed()) {
throw new CancelTaskException();
}
// ----------------------------------------------------------------
// call the user code initialization methods
// ----------------------------------------------------------------
TaskKvStateRegistry kvStateRegistry = kvStateService.createKvStateTaskRegistry(jobId, getJobVertexId());
/*************************************************
* TODO 第六步:
* 注释: 构建一个环境对象RuntimeEnvironment ,包装Task 执行过程中所需要的各种组件
*/
Environment env = new RuntimeEnvironment(jobId, vertexId, executionId, executionConfig, taskInfo, jobConfiguration, taskConfiguration,
userCodeClassLoader, memoryManager, ioManager, broadcastVariableManager, taskStateManager, aggregateManager, accumulatorRegistry,
kvStateRegistry, inputSplitProvider, distributedCacheEntries, consumableNotifyingPartitionWriters, inputGates, taskEventDispatcher,
checkpointResponder, operatorCoordinatorEventGateway, taskManagerConfig, metrics, this, externalResourceInfoProvider);
// Make sure the user code classloader is accessible thread-locally.
// We are setting the correct context class loader before instantiating the invokable
// so that it is available to the invokable during its entire lifetime.
executingThread.setContextClassLoader(userCodeClassLoader);
/*************************************************
* TODO -> 第七步:通过反射 获取启动类实例
* 这句代码里会涉及 SourceStreamTask或者StreamTask的初始化
* 取决于当前这个 ExecutionVertex 是属于哪一个Operator的
*
* 当前这个Task 必定属于某一个 ExecutionVertex , 都有一个启动类的成员变量
* 将来这个Task启动的到底是那种具体的Task
*
* invokable 类别:SourceStreamTask , OneInputStreamTask 。。。。。。
* 要去找: SourceStreamTask 和 OneInputStreamTask 的带 RuntimeEnvironment 参数的构造方法
* -> SourceStreamTask OneInputStreamTask
*
*
* 注释: 获取到代码运行主类
* AbstractInvokable = invokable
*
* nameOfInvokableClass 在生成 StreamGraph 的时候,就已经确定了,见StreamGraph.addOperator 方法
* TODO Class<? extends AbstractInvokable> invokableClass =
* operatorFactory.isStreamSource() ? SourceStreamTask.class : OneInputStreamTask.class;
*
* nameOfInvokableClass 是 JobVertex 的 invokableClassName, AbstractInvokable = invokable
* 每一个 StreamNode 在添加的时候都会有一个 jobVertexClass 属性
* 对于一个 operator chain,就是 head operator 对应的 invokableClassName,见 StreamingJobGraphGenerator.createChain
* 通过反射创建 AbstractInvokable 对象
* 对于 Stream 任务而言,就是 StreamTask 的子类,SourceStreamTask、OneInputStreamTask、TwoInputStreamTask 等
*
*/
// now load and instantiate the task's invokable code
invokable = loadAndInstantiateInvokable(userCodeClassLoader, nameOfInvokableClass, env);
// ----------------------------------------------------------------
// actual task core work
// ----------------------------------------------------------------
// we must make strictly sure that the invokable is accessible to the cancel() call
// by the time we switched to running.
/**
* 第八步: 保存该启动实例
*/
this.invokable = invokable;
/*************************************************
* TODO 第九步
* 注释: 切换task状态 由 DEPLOYING 状态改成: RUNNING
*/
// switch to the RUNNING state, if that fails, we have been canceled/failed in the meantime
if(!transitionState(ExecutionState.DEPLOYING, ExecutionState.RUNNING)) {
throw new CancelTaskException();
}
/**
* TODO
* 第十步:Task 切换进入 RUNNING 状态,并告知 JobMaster
*/
// notify everyone that we switched to running
taskManagerActions.updateTaskExecutionState(new TaskExecutionState(jobId, executionId, ExecutionState.RUNNING));
// make sure the user code classloader is accessible thread-locally
executingThread.setContextClassLoader(userCodeClassLoader);
/*************************************************
* TODO -> 第十一步 : 启动Task的执行
* 注释: 运行任务 在流式应用程序中,都是 StreamTask 的子类
* 1、DataSourceTask
* 2、Operator
* 3、DataSinkTask
*
* -
* AbstractInvokable 是 Task 执行的主要逻辑,也是所有被执行的任务的基类,包括 Streaming 模式和 Batch 模式。
* 在 Streaming 模式下,所有任务都继承自 StreamTask,
* 包括 StreamTask 的子类包括 SourceStreamTask, OneInputStreamTask, TwoInputStreamTask,
* 以及用于迭代模式下的 StreamIterationHead 和 StreamIterationTail。
* -
* 每一个 StreamNode 在添加到 StreamGraph 的时候都会有一个关联的 jobVertexClass 属性,
* 这个属性就是该 StreamNode 对应的 StreamTask 类型;对于一个 OperatorChain 而言,它所对应的
* StreamTask 就是其 head operator 对应的 StreamTask。
*
* -> StreamTask
*/
// run the invokable
invokable.invoke();
// make sure, we enter the catch block if the task leaves the invoke() method due
// to the fact that it has been canceled
if(isCanceledOrFailed()) {
throw new CancelTaskException();
}
// ----------------------------------------------------------------
// finalization of a successful execution
// ----------------------------------------------------------------
/**
* 第十二步: ResultPartitionWriter 完成 所有 还未 flush 的数据 flush动作
*/
// finish the produced partitions. if this fails, we consider the execution failed.
for(ResultPartitionWriter partitionWriter : consumableNotifyingPartitionWriters) {
if(partitionWriter != null) {
partitionWriter.finish();
}
}
/*************************************************
* TODO 第十三步 : 状态更新
* 注释: 由 RUNNING 状态改成: FINISHED 状态
*/
// try to mark the task as finished
// if that fails, the task was canceled/failed in the meantime
if(!transitionState(ExecutionState.RUNNING, ExecutionState.FINISHED)) {
throw new CancelTaskException();
}
------------------------------
重点步骤: 第三步 : 注册输入和输出组件 启动 ResultPartitionWriter 和 InputGate
setupPartitionsAndGates
/**
 * Calls {@code setup()} on every produced partition first and then on every input gate.
 *
 * @param producedPartitions the result partition writers to set up
 * @param inputGates the input gates to set up, after all partitions
 * @throws IOException declared by the method; presumably raised by one of the setup() calls
 */
public static void setupPartitionsAndGates(ResultPartitionWriter[] producedPartitions, InputGate[] inputGates) throws IOException {
/**
* TODO
* Registers this task's ResultPartition with the ResultPartitionManager of the TaskManager
* that started the task; that manager tracks and manages ResultPartitions
* (author's note; the registration itself happens inside setup()).
*/
for(ResultPartitionWriter partition : producedPartitions) {
// ResultPartition TODO ConsumableNotifyingResultPartitionWriterDecorator
partition.setup();
}
/**
* TODO
* Allocates a BufferPool for the InputChannels of this task's InputGates
* (author's note; done inside setup()).
*/
// InputGates must be initialized after the partitions, since during InputGate#setup
// we are requesting partitions
for(InputGate gate : inputGates) {
// TODO SingleInputGate
gate.setup();
}
}
3. SourceStreamTask 和 StreamTask 执行
在 beforeInvoke() 中,主要是初始化 OperatorChain,然后调用 init() 执行初始化,然后恢复状态,更
改 Task 自己的状态为 isRunning = true
在 runMailboxLoop() 中,主要是不停的处理 mail,这里是 Flink-1.10 的一项改进,使用了 mailbox
模型来处理任务
在 afterInvoke() 中,主要是完成 Task 要结束之前需要完成的一些细节,比如,把 Buffer 中未 flush 的
数据 flush 出来
最后,在 cleanUpInvoke() 主要做一些资源的释放,执行各种关闭动作:set false,interrupt,
shutdown,close,cleanup,dispose 等
invokable.invoke(); -> StreamTask.invoke
public final void invoke() throws Exception {
try {
/*************************************************
* TODO
* 注释: 第一步:初始化OperatorChain,然后调用init初始化,然后恢复状态,更改task的状态为 isRunning
*
* 如果是 SourceStreamTask 则启动 对接数据源的线程,执行相应的初始化
* 如果是 OneInputStreamTask 则需要对接上游的 Task 的 ResultPartition
*
*/
beforeInvoke();
// final check to exit early before starting to run
if(canceled) {
throw new CancelTaskException();
}
/*************************************************
* TODO 第二步 、 不停的处理mail 使用mailbox模型来处理任务
* 注释: Task 开始工作
* 执行这句代码的时候,还是在 Task 所在的那个线程中执行的。
*/
// let the task do its work
runMailboxLoop();
// if this left the run() method cleanly despite the fact that this was canceled,
// make sure the "clean shutdown" is not attempted
if(canceled) {
throw new CancelTaskException();
}
/*************************************************
* TODO
* 注释: 第三步: 完成task要结束之前需要完成的一些细节,比如:将Buffer中未flush的数据flush出来
*/
afterInvoke();
3、1 beforeInvoke()
/**
 * Prepares the task before the mailbox loop: builds the operator chain, fetches the head
 * operator, runs the task-specific init(), then (through the action executor) initializes
 * state, opens the operators and reads the recovered channel state, and finally sets
 * {@code isRunning} to true. Throws CancelTaskException if the task was canceled after init().
 */
protected void beforeInvoke() throws Exception {
disposedOperators = false;
LOG.debug("Initializing {}.", getName());
/*************************************************
* TODO
* Note: builds the OperatorChain object, which does a lot of work internally.
* It initializes the output objects.
* Mainly three things (author's summary of the OperatorChain constructor):
* 1. calls createStreamOutput() to create the RecordWriterOutput for the downstream
* 2. calls createOutputCollector() to wire the StreamConfig of the chained operators (from the optimized logical plan) to the RecordWriterOutput created by createStreamOutput()
* 3. calls getChainedOutputs() to expose the resulting RecordWriterOutput
*/
operatorChain = new OperatorChain<>(this, recordWriter);
/**
* TODO Note: gets the first Operator of the OperatorChain.
*
* Author's note: only after this is initialized is
* controller.suspendDefaultAction() in SourceStreamTask.processInput released
* and LegacySourceFunctionThread.run executed.
*
* In other words, the headOperator needed by the data-receiving thread is finally initialized.
* Up to this point, all components used by the current OperatorChain have been created,
* so it can receive data and start stream processing.
*/
headOperator = operatorChain.getHeadOperator();
/*************************************************
* TODO
* Note: runs the initialization of SourceStreamTask | OneInputStreamTask.
* Initializes StreamOneInputProcessor, DataOutput, DataInput, CheckpointedInputGate.
*
* For a SourceStreamTask, it checks whether the source is an ExternallyInducedSource
* and, if so, registers a savepoint hook.
* For a OneInputStreamTask, it creates CheckpointedInputGate, StreamTaskNetworkOutput,
* StreamTaskNetworkInput and StreamOneInputProcessor,
* which are used for shuffle-related data transfer.
*
* 1. It may be a SourceStreamTask; in that case only a savepoint hook is registered.
* 2. It may also be a OneInputStreamTask.
*
*/
// task specific initialization
init();
// save the work of reloading state, etc, if the task is already canceled
if(canceled) {
throw new CancelTaskException();
}
// -------- Invoke --------
LOG.debug("Invoking {}", getName());
// we need to make sure that any triggers scheduled in open() cannot be
// executed before all operators are opened
actionExecutor.runThrowing(() -> {
/*************************************************
* TODO
* Note: entry point of state recovery
*/
// both the following operations are protected by the lock
// so that we avoid race conditions in the case that initializeState()
// registers a timer, that fires before the open() is called.
operatorChain.initializeStateAndOpenOperators(createStreamTaskStateInitializer());
/*************************************************
* TODO core
* Note: initializes the Mail.
* This is mainly where input-related details such as the InputGate are initialized.
*/
readRecoveredChannelState();
});
isRunning = true;
}
readRecoveredChannelState -> mainMailboxExecutor.execute(this::requestPartitions
-> inputGate.requestPartitions(); -> SingleInputGate.requestPartitions -> internalRequestPartitions
-> inputChannel.requestSubpartition
-> RemoteInputChannel.requestSubpartition
-> partitionRequestClient.requestSubpartition -> NettyPartitionRequestClient.requestSubpartition
/*************************************************
* TODO
* 注释: 发送请求: tcpChannel.writeAndFlush(request);
*
* -> NettyProtocol.PartitionRequestServerHandler.channelRead0 做对应处理
*/
if(delayMs == 0) {
ChannelFuture f = tcpChannel.writeAndFlush(request);
f.addListener(listener);
} else {
-> 服务端 PartitionRequestServerHandler.channelRead0
-> reader.requestSubpartitionView( -> partitionProvider.createSubpartitionView(
-> partition.createSubpartitionView -> subpartitions[index].createReadView(availabilityListener)
-> readView = new PipelinedSubpartitionView(this, availabilityListener);
OperatorChain 的初始化,首先会为每个 Operator 创建一个 RecordWriterOutput,再为每个
Operator 创建一个 OutputCollector。然后把每一个 Operator 都包装成 OperatorWrapper 放入
List allOpWrappers 集合中。最后调用linkOperatorWrappers(allOpWrappers);
方法以 逻辑正序 的方式来构建 StreamOperator 的链式关系。
然后是 init() 方法,对于 SourceStreamTask 来说,就是看 Source 是不是
ExternallyInducedSource,如果是,则注册一个 savepoint 钩子。对于 OneInputStreamTask 来说,
主要就是创建 CheckpointedInputGate,StreamTaskNetworkOutput,StreamTaskNetworkInput,
StreamOneInputProcessor 用来进行 Shuffle 相关的数据传输。
到此为止,Task 初始化和预执行相关的,都基本到位了,然后就开始从我们的 SourceStreamTask 的
HeadOperator 的数据接收线程,开始流式处理。
3、2 runMailboxLoop (Task 开始工作)
StreamTask.invoke -> runMailboxLoop -> mailboxProcessor.runMailboxLoop()
-> runMailboxStep -> runDefaultAction
-> -> StreamTask this.mailboxProcessor = new MailboxProcessor(this::processInput, mailbox, actionExecutor);
-> this::processInput (SourceStreamTask|OneInputStreamTask)
-> SourceStreamTask.processInput -> sourceThread.start() -> LegacySourceFunctionThread.run
-> headOperator.run(){
/*************************************************
* TODO
* 注释: 真正运行用户的 Operator
* 1、如果你使用:env.socketTextStream() 则调用: SocketTextStreamFunction
* 2、如果你使用:Kafka数据源, 则调用: FlinkKafkaConsumerBase
* ......
* function --> transformation ---> streamOperator
* headOperator.run();
*/
userFunction.run(ctx);
}
-> SocketTextStreamFunction.run -> ctx.collect(record); -> processAndCollect
-> output.collect( -> pushToOperator -> processElement
-> StreamMap.processElement
/**
 * Applies the user's map function to the record value, replaces the value inside the same
 * record object, and hands the record to {@code output}.
 *
 * @param element the incoming record; its value is replaced with the mapped result
 */
public void processElement(StreamRecord<IN> element) throws Exception {
/**
* element.getValue() is the data to be processed.
* 1. userFunction.map(element.getValue()) runs the user-defined map logic and yields the mapped result
* 2. the computed result then replaces the value held by the record (element.replace)
* 3. the record is then collected by the output of this StreamMap StreamOperator
*
* -- OperatorChain.collector: inside a chain
* --
* env.socketTextStream.map.keyby.sum
* map -> keyby
* If the current Operator is the last one in an OperatorChain, output here is a RecordWriterOutput
* -> RecordWriterOutput
*/
output.collect(element.replace(userFunction.map(element.getValue())));
}
-> pushToRecordWriter -> emit -> RecordWriter.emit(){
/**
 * Serializes the record and copies the serialized bytes to the given target channel;
 * prunes the serializer's buffer when the copy call returns true.
 *
 * @param record the record to serialize
 * @param targetChannel index of the channel the serialized record is copied to
 */
protected void emit(T record, int targetChannel) throws IOException, InterruptedException {
checkErroneous();
// TODO_MA note: serialize into a ByteBuffer
serializer.serializeRecord(record);
// TODO_MA note: write the serializer's result into the target channel -> copyFromSerializerToTargetChannel
// Make sure we don't hold onto the large intermediate serialization buffer for too long
if (copyFromSerializerToTargetChannel(targetChannel)) {
// TODO_MA note: clear the buffer used for serialization (the temporary byte[] written during serialization) to reduce memory usage
serializer.prune();
}
}
}
-> copyFromSerializerToTargetChannel -> flushTargetPartition -> flush
-> PipelinedSubpartition
public void flush() {
final boolean notifyDataAvailable;
// TODO_MA 注释: 先校验,是否满足 数据可用要求
synchronized(buffers) {
if(buffers.isEmpty() || flushRequested) {
return;
}
// if there is more then 1 buffer, we already notified the reader
// (at the latest when adding the second buffer)
// TODO_MA 注释: 不是 checkpoint 阻塞,buffers大小为 1, 数据可用
// 执行checkpoint数据会阻塞
notifyDataAvailable = !isBlockedByCheckpoint && buffers.size() == 1 && buffers.peek().isDataAvailable();
// TODO_MA 注释: 如果 buffers 数量大于1,证明,之前已经执行了 notifyDataAvailable()
flushRequested = buffers.size() > 1 || notifyDataAvailable;
}
/*************************************************
* TODO
* 注释: 通知数据可用
*/
if(notifyDataAvailable) {
/**
* TODO
* 通知 readView ,数据可用了
* readView 是ResultSubPartition 的消费者视图对象
* 下游的一个task可能会消费上游多个task的某一个分区的数据
* 上游任意一个task 的任意一个分区叫做:ResultSubPartition
* 这个 ResultSubPartition 对应一个消费者: PipelinedSubpartitionView
*
*/
notifyDataAvailable();
}
}
-> notifyDataAvailable(){
/*************************************************
* TODO ->
* 注释:
* availabilityListener =
* 1. CreditBasedSequenceNumberingViewReader
* 2. LocalInputChannel
*/
availabilityListener.notifyDataAvailable();
}
/*
* 1. CreditBasedSequenceNumberingViewReader.notifyDataAvailable()
* 2. LocalInputChannel.notifyDataAvailable()
*/
// -> 1. CreditBasedSequenceNumberingViewReader.notifyDataAvailable()
-> readView.notifyDataAvailable(); -> requestQueue.notifyReaderNonEmpty(this);
加入队列后消费
->PartitionRequestQueue.userEventTriggered.enqueueAvailableReader -> writeAndFlushNextMessageIfPossible(){
// TODO 此处真正完成 从 NettyServer 写一条数据 到 NettyClient
// TODO
channel.writeAndFlush(msg).addListener(writeListener);
}
-> NettyProtocol.CreditBasedPartitionRequestClientHandler.channelRead -> decodeMsg
-> decodeBufferOrEvent -> RemoteInputChannel.onBuffer -> notifyChannelNonEmpty
-> queueChannel -> inputChannelsWithData.add(channel);
注释: 加入队列中
既然将 有数据可用的channel 加入到 inputChannelsWithData,
那就证明,一定有其他的什么角色来从这个队列中获取 可用的channel 来消费数据
-> StreamTask.new MailboxProcessor(this::processInput, mailbox, actionExecutor);
-> processInput -> inputProcessor.processInput();
-> StreamOneInputProcessor.processInput
-> StreamTaskNetworkInput.emitNext(){
processElement(deserializationDelegate.getInstance(), output); -> output.emitRecord(recordOrMark.asRecord());
-> OneInputStreamTask.StreamTaskNetworkOutput.emitRecord 输出 计算逻辑处理
/*************************************************
* TODO 核心
* 注释: 获取输入
*/
Optional<BufferOrEvent> bufferOrEvent = checkpointedInputGate.pollNext();
if(bufferOrEvent.isPresent()) {
// return to the mailbox after receiving a checkpoint barrier to avoid processing of
// data after the barrier before checkpoint is performed for unaligned checkpoint mode
if(bufferOrEvent.get().isEvent() && bufferOrEvent.get().getEvent() instanceof CheckpointBarrier) {
return InputStatus.MORE_AVAILABLE;
}
/*************************************************
* TODO
* 注释: 处理数据(把读取到的 buffer 交给 currentRecordDeserializer, 准备进行反序列化)
*/
processBufferOrEvent(bufferOrEvent.get());
} else {
}
-----------------------------------------------------------------
// 2. LocalInputChannel.notifyDataAvailable()
-> notifyChannelNonEmpty -> queueChannel(){
/*************************************************
* TODO
* 注释: 加入队列中
* 既然将 有数据可用的channel 加入到 inputChannelsWithData,
* 那就证明,一定有其他的什么角色来从这个队列中获取 可用的channel 来消费数据
*/
inputChannelsWithData.add(channel);
/**
* ->
* 如果之前队列中没有channel, 这个channel加入后, 通知在 getChannel() 中阻塞于 inputChannelsWithData.wait() 的线程
*/
inputChannelsWithData.notifyAll();
}
-> inputChannelsWithData.wait(); // 可反推到 this.mailboxProcessor = new MailboxProcessor(this::processInput, mailbox, actionExecutor);
StreamTask.processInput -> StreamOneInputProcessor.processInput -> StreamTaskNetworkInput.emitNext(){
/** 循环体分为两个部分
* 1、currentRecordDeserializer == null 时: 先通过 pollNext 获取输入, 再由 processBufferOrEvent 把 buffer 交给反序列化器
* 2、currentRecordDeserializer != null 时: 反序列化出完整记录后, 由 processElement 进行计算并发往下游
*/
while(true) {
// TODO 如果可以通过 currentRecordDeserializer 反序列化得来结果
// get the stream element from the deserializer
if(currentRecordDeserializer != null) {
// TODO_MA 注释: 进行 Record 的反序列化
DeserializationResult result = currentRecordDeserializer.getNextRecord(deserializationDelegate);
if(result.isBufferConsumed()) {
currentRecordDeserializer.getCurrentBuffer().recycleBuffer();
currentRecordDeserializer = null;
}
/*************************************************
* TODO
* 注释: 处理记录
*/
if(result.isFullRecord()) {
// TODO -> 计算逻辑处理 并发往下游
processElement(deserializationDelegate.getInstance(), output);
return InputStatus.MORE_AVAILABLE;
}
}
/*************************************************
* TODO
* 注释: 获取输入
* -->
*/
Optional<BufferOrEvent> bufferOrEvent = checkpointedInputGate.pollNext();
if(bufferOrEvent.isPresent()) {
// return to the mailbox after receiving a checkpoint barrier to avoid processing of
// data after the barrier before checkpoint is performed for unaligned checkpoint mode
if(bufferOrEvent.get().isEvent() && bufferOrEvent.get().getEvent() instanceof CheckpointBarrier) {
return InputStatus.MORE_AVAILABLE;
}
/*************************************************
* TODO -> 重点
* 注释: 处理数据(把读取到的 buffer 交给 currentRecordDeserializer, 准备进行反序列化)
*/
processBufferOrEvent(bufferOrEvent.get());
} else {
if(checkpointedInputGate.isFinished()) {
checkState(checkpointedInputGate.getAvailableFuture().isDone(), "Finished BarrierHandler should be available");
return InputStatus.END_OF_INPUT;
}
return InputStatus.NOTHING_AVAILABLE;
}
}
}
//以上逻辑分三步
// 第一步 获取输入
checkpointedInputGate.pollNext() -> inputGate.pollNext(从缓冲区或者 InputGate 中拉取数据); -> getNextBufferOrEvent
-> waitAndGetNextData -> getChannel(){
/*************************************************
* TODO
* 注释: 如果现在还没有数据,就阻塞
*/
while(inputChannelsWithData.size() == 0) {
if(closeFuture.isDone()) {
throw new IllegalStateException("Released");
}
/*************************************************
* TODO
* 注释: 阻塞
*/
if(blocking) {
// TODO <--
inputChannelsWithData.wait();
} else {
availabilityHelper.resetUnavailable();
return Optional.empty();
}
}
}
// 第二步 处理数据(把读取到的 buffer 交给反序列化器, 准备进行反序列化)
processBufferOrEvent(bufferOrEvent.get()); -> currentRecordDeserializer.setNextBuffer
// 第三步 计算逻辑处理 并发往下游
processElement(deserializationDelegate.getInstance(), output); -> output.emitRecord