This article walks through the Flink SQL workflow:
1. SQL -> SqlNode: the parse phase, which produces the AST (abstract syntax tree).
2. SqlNode -> RelNode -> Operation: validation and conversion, then optimization based on RBO and CBO.
3. RelNode -> ExecNode: the logical plan is converted into a physical execution plan.
4. ExecNode -> Transformation: the physical plan is translated into operators.
5. A StreamGraph is built and the job is submitted asynchronously.
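Before diving into the internals, here is a minimal usage sketch that drives this whole pipeline (a sketch only, assuming Flink 1.11+ with the blink planner; the table names src/dst and the datagen/print connectors are just for illustration). Every executeSql() call below goes through parse -> validate -> optimize -> translate -> submit.

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

public class FlinkSqlPipelineDemo {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);

        // DDL: parse -> SqlToOperationConverter -> CreateTableOperation -> catalog
        tableEnv.executeSql(
            "CREATE TABLE src (id INT, name STRING) WITH ('connector' = 'datagen')");
        tableEnv.executeSql(
            "CREATE TABLE dst (id INT, name STRING) WITH ('connector' = 'print')");

        // DML: parse -> CatalogSinkModifyOperation -> optimize (RBO/CBO) -> ExecNode
        //      -> Transformation -> StreamGraph, submitted asynchronously
        tableEnv.executeSql("INSERT INTO dst SELECT id, name FROM src");
    }
}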
Now let's step through the code:
//todo execute the SQL statement
@Override
public TableResult executeSql(String statement) {
    //todo
    // 1. Parse: build the AST (abstract syntax tree)
    // 2. Validate: bind/validate and produce Operations; a query additionally carries a RelNode
    //    create : CreateTableOperation
    //    insert : CatalogSinkModifyOperation
    //    query  : PlannerQueryOperation
    List<Operation> operations = parser.parse(statement);
    if (operations.size() != 1) {
        throw new TableException(UNSUPPORTED_QUERY_IN_EXECUTE_SQL_MSG);
    }
    //todo Optimize (queries produce a RelRoot), then RelRoot -> ExecNode and execute
    return executeOperation(operations.get(0));
}
executeSql() mainly does two things:
1.parse(..)
Internally it obtains a Calcite parser and uses it to produce the SqlNode. If the statement is a query, the SqlNode is further converted into a RelNode and wrapped in an Operation; how a statement is matched to a concrete Operation is implemented inside convert().
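To make the SQL -> SqlNode step concrete, here is a standalone Calcite sketch (plain Calcite, not Flink's CalciteParser wrapper; the table name src is hypothetical):

import org.apache.calcite.sql.SqlNode;
import org.apache.calcite.sql.parser.SqlParseException;
import org.apache.calcite.sql.parser.SqlParser;

public class CalciteParseDemo {
    public static void main(String[] args) throws SqlParseException {
        // Calcite turns the SQL text into an AST of SqlNode objects.
        SqlParser parser = SqlParser.create("SELECT id, name FROM src WHERE id > 10");
        SqlNode ast = parser.parseStmt();
        System.out.println(ast.getKind());   // SELECT
        System.out.println(ast.toString());  // the SQL re-rendered from the AST
    }
}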
@Override
public List<Operation> parse(String statement) {
    //todo get the Calcite parser
    CalciteParser parser = calciteParserSupplier.get();
    //todo FlinkPlannerImpl acts as the validator
    FlinkPlannerImpl planner = validatorSupplier.get();
    // parse the sql query
    //todo 1. parse: SQL -> SqlNode, producing the AST (abstract syntax tree)
    SqlNode parsed = parser.parse(statement);
    //todo validate the SqlNode, convert it into a RelRoot tree, then wrap it in an Operation
    //todo create : CreateTableOperation
    //todo insert : CatalogSinkModifyOperation
    //todo query  : PlannerQueryOperation
    Operation operation = SqlToOperationConverter.convert(planner, catalogManager, parsed)
            .orElseThrow(() -> new TableException("Unsupported query: " + statement));
    return Collections.singletonList(operation);
}
2.executeOperation(..)
This is where the RelNode carried by the Operation above is optimized (see the RBO sketch below):
RBO (rule-based optimization): implemented by HepPlanner; rules such as predicate pushdown, constant folding and column pruning.
CBO (cost-based optimization): implemented by VolcanoPlanner; chooses plans based on cost.
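As a rough illustration of what rule-based optimization looks like in Calcite (a sketch assuming Calcite 1.23+ where CoreRules exists; Flink itself drives this through FlinkHepProgram with its own rule sets):

import org.apache.calcite.plan.hep.HepPlanner;
import org.apache.calcite.plan.hep.HepProgram;
import org.apache.calcite.plan.hep.HepProgramBuilder;
import org.apache.calcite.rel.RelNode;
import org.apache.calcite.rel.rules.CoreRules;

public class RboSketch {
    public static RelNode optimize(RelNode logicalPlan) {
        HepProgram program = new HepProgramBuilder()
            .addRuleInstance(CoreRules.FILTER_INTO_JOIN)           // predicate pushdown into joins
            .addRuleInstance(CoreRules.FILTER_REDUCE_EXPRESSIONS)  // constant folding in filters
            .addRuleInstance(CoreRules.PROJECT_REMOVE)             // drop trivial projections
            .build();
        HepPlanner planner = new HepPlanner(program);
        planner.setRoot(logicalPlan);
        return planner.findBestExp();  // plan after the rules have been applied to a fixpoint
    }
}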
1.executeOperation:
private TableResult executeOperation(Operation operation) {
    //todo INSERT goes through a ModifyOperation (subclass: CatalogSinkModifyOperation)
    if (operation instanceof ModifyOperation) {
        //todo RelRoot -> ExecNode
        return executeInternal(Collections.singletonList((ModifyOperation) operation));
    //todo CREATE TABLE
    } else if (operation instanceof CreateTableOperation) {
        CreateTableOperation createTableOperation = (CreateTableOperation) operation;
        if (createTableOperation.isTemporary()) {
            catalogManager.createTemporaryTable(
                createTableOperation.getCatalogTable(),
                createTableOperation.getTableIdentifier(),
                createTableOperation.isIgnoreIfExists());
        } else {
            //todo not a temporary table
            catalogManager.createTable(
                createTableOperation.getCatalogTable(),
                createTableOperation.getTableIdentifier(),
                createTableOperation.isIgnoreIfExists());
        }
        return TableResultImpl.TABLE_RESULT_OK;
    ...
2.executeInternal:
@Override
public TableResult executeInternal(List<ModifyOperation> operations) {
    //todo RelRoot -> ExecNode -> Transformation: turn the SQL into executable code
    List<Transformation<?>> transformations = translate(operations);
    //todo assemble the job name from the sink identifiers
    List<String> sinkIdentifierNames = extractSinkIdentifierNames(operations);
    String jobName = "insert-into_" + String.join(",", sinkIdentifierNames);
    //todo build the StreamGraph (pipeline) from the transformations
    Pipeline pipeline = execEnv.createPipeline(transformations, tableConfig, jobName);
...
3.translate:
//todo RelRoot -> ExecNode
override def translate(
    modifyOperations: util.List[ModifyOperation]): util.List[Transformation[_]] = {
  if (modifyOperations.isEmpty) {
    return List.empty[Transformation[_]]
  }
  // prepare the execEnv before translating
  getExecEnv.configure(
    getTableConfig.getConfiguration,
    Thread.currentThread().getContextClassLoader)
  overrideEnvParallelism()
  //todo extract the RelNode from each Operation
  val relNodes = modifyOperations.map(translateToRel)
  //todo optimize the logical plan (RBO, CBO)
  val optimizedRelNodes = optimize(relNodes)
  //todo RelNode -> ExecNode: build the physical execution plan
  val execNodes = translateToExecNodePlan(optimizedRelNodes)
  //todo ExecNode -> Transformation (operators)
  translateToPlan(execNodes)
}
....
4.optimize:
override def optimize(roots: Seq[RelNode]): Seq[RelNode] = {
  //todo optimize block by block (RelNodeBlock), applying RBO and CBO
  val sinkBlocks = doOptimize(roots)
  //todo collect the optimized logical plan of each block
  val optimizedPlan = sinkBlocks.map { block =>
    val plan = block.getOptimizedPlan
    require(plan != null)
    plan
  }
  expandIntermediateTableScan(optimizedPlan)
}
...
5.doOptimize/optimizeTree/optimize:
def optimize(root: RelNode, context: OC): RelNode = {
  //todo RBO: FlinkHepProgram, CBO: FlinkVolcanoProgram
  programNames.foldLeft(root) {
    (input, name) =>
      val program = get(name).getOrElse(throw new TableException(s"This should not happen."))
      val start = System.currentTimeMillis()
      //todo apply the current program to the plan
      val result = program.optimize(input, context)
      val end = System.currentTimeMillis()
      if (LOG.isDebugEnabled) {
        LOG.debug(s"optimize $name cost ${end - start} ms.\n" +
          s"optimize result: \n${FlinkRelOptUtil.toString(result)}")
      }
      result
  }
}
...
Next, the ExecNodes (the physical execution plan) are built, operators are generated, the StreamGraph is created, and the job is submitted asynchronously (see the sketch below). It is during step 3 above (translate) that the sink starts to be built.
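What "build a StreamGraph and submit asynchronously" means is easiest to see at the DataStream API level (a minimal sketch; the job itself is just a placeholder):

import org.apache.flink.core.execution.JobClient;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class AsyncSubmitSketch {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.fromElements(1, 2, 3).filter(i -> i > 1).print();
        // executeAsync() builds the StreamGraph from the registered Transformations
        // and submits the job without blocking; the returned JobClient can be used
        // to query status or cancel the job.
        JobClient client = env.executeAsync("async-submit-sketch");
        System.out.println("Submitted job " + client.getJobID());
    }
}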
StreamExecLegacySink:
override protected def translateToPlanInternal(
    ...
  //todo wrap the Transformation in a DataStream
  val dataStream = new DataStream(planner.getExecEnv, transformation)
  //todo entry point of the Hive table sink: build the sink
  val dsSink = streamTableSink.consumeDataStream(dataStream)
...
At this point we finally reach the core code of Flink's write path into Hive:
//todo consume the data stream
@Override
public final DataStreamSink consumeDataStream(DataStream dataStream) {
    checkAcidTable(catalogTable, identifier.toObjectPath());
    String[] partitionColumns = getPartitionKeys().toArray(new String[0]);
    String dbName = identifier.getDatabaseName();
    String tableName = identifier.getObjectName();
    try (HiveMetastoreClientWrapper client = HiveMetastoreClientFactory.create(
            new HiveConf(jobConf, HiveConf.class), hiveVersion)) {
        //todo fetch the table metadata from the Hive metastore
        Table table = client.getTable(dbName, tableName);
        StorageDescriptor sd = table.getSd();
        HiveTableMetaStoreFactory msFactory = new HiveTableMetaStoreFactory(
                jobConf, hiveVersion, dbName, tableName);
        HadoopFileSystemFactory fsFactory = new HadoopFileSystemFactory(jobConf);
        Class hiveOutputFormatClz = hiveShim.getHiveOutputFormatClass(
                Class.forName(sd.getOutputFormat()));
        boolean isCompressed = jobConf.getBoolean(HiveConf.ConfVars.COMPRESSRESULT.varname, false);
        HiveWriterFactory recordWriterFactory = new HiveWriterFactory(
                jobConf,
                hiveOutputFormatClz,
                sd.getSerdeInfo(),
                tableSchema,
                partitionColumns,
                HiveReflectionUtils.getTableMetadata(hiveShim, table),
                hiveShim,
                isCompressed);
        String extension = Utilities.getFileExtension(jobConf, isCompressed,
                (HiveOutputFormat<?, ?>) hiveOutputFormatClz.newInstance());
        //todo configure the output file naming (prefix/suffix)
        OutputFileConfig outputFileConfig = OutputFileConfig.builder()
                .withPartPrefix("part-" + UUID.randomUUID().toString())
                .withPartSuffix(extension == null ? "" : extension)
                .build();
        if (isBounded) {
            //todo batch mode
            FileSystemOutputFormat.Builder<Row> builder = new FileSystemOutputFormat.Builder<>();
            builder.setPartitionComputer(new HiveRowPartitionComputer(
                    hiveShim,
                    jobConf.get(
                            HiveConf.ConfVars.DEFAULTPARTITIONNAME.varname,
                            HiveConf.ConfVars.DEFAULTPARTITIONNAME.defaultStrVal),
                    tableSchema.getFieldNames(),
                    tableSchema.getFieldDataTypes(),
                    partitionColumns));
            builder.setDynamicGrouped(dynamicGrouping);
            builder.setPartitionColumns(partitionColumns);
            builder.setFileSystemFactory(fsFactory);
            builder.setFormatFactory(new HiveOutputFormatFactory(recordWriterFactory));
            builder.setMetaStoreFactory(msFactory);
            builder.setOverwrite(overwrite);
            builder.setStaticPartitions(staticPartitionSpec);
            builder.setTempPath(new org.apache.flink.core.fs.Path(
                    toStagingDir(sd.getLocation(), jobConf)));
            builder.setOutputFileConfig(outputFileConfig);
            return dataStream
                    .writeUsingOutputFormat(builder.build())
                    .setParallelism(dataStream.getParallelism());
        } else {
            //todo streaming mode
            org.apache.flink.configuration.Configuration conf = new org.apache.flink.configuration.Configuration();
            catalogTable.getOptions().forEach(conf::setString);
            HiveRowDataPartitionComputer partComputer = new HiveRowDataPartitionComputer(
                    hiveShim,
                    jobConf.get(
                            HiveConf.ConfVars.DEFAULTPARTITIONNAME.varname,
                            HiveConf.ConfVars.DEFAULTPARTITIONNAME.defaultStrVal),
                    tableSchema.getFieldNames(),
                    tableSchema.getFieldDataTypes(),
                    partitionColumns);
            //todo bucket assigner: decides which partition a record goes to (e.g. by time or by column values)
            TableBucketAssigner assigner = new TableBucketAssigner(partComputer);
            //todo rolling policy for the in-progress files
            HiveRollingPolicy rollingPolicy = new HiveRollingPolicy(
                    conf.get(SINK_ROLLING_POLICY_FILE_SIZE).getBytes(),
                    conf.get(SINK_ROLLING_POLICY_ROLLOVER_INTERVAL).toMillis());
            BucketsBuilder<RowData, String, ? extends BucketsBuilder<RowData, ?, ?>> builder;
            if (userMrWriter) {
                // Hadoop native writer
                builder = bucketsBuilderForMRWriter(recordWriterFactory, sd, assigner, rollingPolicy, outputFileConfig);
                LOG.info("Hive streaming sink: Use MapReduce RecordWriter writer.");
            } else {
                //todo only ORC and Parquet are supported: build the columnar (bulk) writer factory
                Optional<BulkWriter.Factory<RowData>> bulkFactory = createBulkWriterFactory(partitionColumns, sd);
                //todo is it a columnar storage format?
                if (bulkFactory.isPresent()) {
                    //todo write the stream into the file system in bulk format
                    builder = StreamingFileSink.forBulkFormat(
                            new org.apache.flink.core.fs.Path(sd.getLocation()),
                            new FileSystemTableSink.ProjectionBulkFactory(bulkFactory.get(), partComputer))
                            .withBucketAssigner(assigner)            // partitioning
                            .withRollingPolicy(rollingPolicy)        // rolling policy
                            .withOutputFileConfig(outputFileConfig); // file naming
                    LOG.info("Hive streaming sink: Use native parquet&orc writer.");
                } else {
                    // Hadoop native writer
                    builder = bucketsBuilderForMRWriter(recordWriterFactory, sd, assigner, rollingPolicy, outputFileConfig);
                    LOG.info("Hive streaming sink: Use MapReduce RecordWriter writer because BulkWriter Factory not available.");
                }
            }
            //todo add the partition-commit operator (StreamingFileCommitter)
            return FileSystemTableSink.createStreamingSink(
                    conf,
                    new org.apache.flink.core.fs.Path(sd.getLocation()),
                    getPartitionKeys(),
                    identifier,
                    overwrite,
                    dataStream,
                    builder,
                    msFactory,
                    fsFactory,
                    conf.get(SINK_ROLLING_POLICY_CHECK_INTERVAL).toMillis());
        }
    } catch (TException e) {
        throw new CatalogException("Failed to query Hive metaStore", e);
    } catch (IOException e) {
        throw new FlinkRuntimeException("Failed to create staging dir", e);
    } catch (ClassNotFoundException e) {
        throw new FlinkHiveException("Failed to get output format class", e);
    } catch (IllegalAccessException | InstantiationException e) {
        throw new FlinkHiveException("Failed to instantiate output format instance", e);
    }
}
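For reference, this is roughly the user-facing setup that exercises the streaming branch above (a sketch only: hive_sink and kafka_source are hypothetical tables, a HiveCatalog is assumed to be registered, and the option keys follow the Flink 1.11+ Hive/filesystem connector documentation, matching the SINK_ROLLING_POLICY_* settings read in the code):

import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.SqlDialect;
import org.apache.flink.table.api.TableEnvironment;

public class HiveStreamingSinkSketch {
    public static void main(String[] args) {
        TableEnvironment tableEnv = TableEnvironment.create(
            EnvironmentSettings.newInstance().inStreamingMode().build());
        // A HiveCatalog would normally be registered and set as the current catalog here.
        tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
        tableEnv.executeSql(
            "CREATE TABLE hive_sink (id INT, name STRING) PARTITIONED BY (dt STRING) " +
            "STORED AS parquet TBLPROPERTIES (" +
            " 'sink.rolling-policy.file-size' = '128MB'," +
            " 'sink.rolling-policy.rollover-interval' = '30min'," +
            " 'sink.rolling-policy.check-interval' = '1min'," +
            " 'sink.partition-commit.trigger' = 'partition-time'," +
            " 'sink.partition-commit.policy.kind' = 'metastore,success-file')");
        tableEnv.getConfig().setSqlDialect(SqlDialect.DEFAULT);
        // kafka_source stands in for any streaming source table
        tableEnv.executeSql("INSERT INTO hive_sink SELECT id, name, dt FROM kafka_source");
    }
}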
Summary
From SQL text to SqlNode, from SqlNode to RelNode and Operation, through RBO/CBO optimization to ExecNode, then Transformation, and finally a StreamGraph submitted asynchronously down to the Hive streaming sink. Dear Flink SQL!