1. Preface
The previous two parts mainly covered the following two points:
- The overall Flink SQL execution flow, roughly: sqlNode --> Operation --> RelNode --> optimization --> execNode --> Transformation.
- The Java SPI mechanism that this source-code walkthrough relies on (see the short sketch right after this list).
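As a quick refresher on the SPI point, the minimal sketch below (my own illustration, not part of the original example; it assumes the Flink table factory classes are on the classpath) lists every table Factory that Flink can discover via Java SPI. This is exactly the lookup mechanism used later to resolve options such as 'connector' = 'kafka' or 'connector' = 'print':
import java.util.ServiceLoader
import org.apache.flink.table.factories.Factory

object ListFactories {
  def main(args: Array[String]): Unit = {
    // Java SPI: reads META-INF/services/org.apache.flink.table.factories.Factory
    // from every jar on the classpath and instantiates the listed implementations.
    val it = ServiceLoader.load(classOf[Factory]).iterator()
    while (it.hasNext) {
      val f = it.next()
      println(s"${f.factoryIdentifier()} -> ${f.getClass.getName}")
    }
  }
}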
Now let's look concretely at how Flink SQL implements these conversion steps internally, i.e. how it ends up calling the connectors; this happens mainly in the sqlNode --> RelNode step.
Executing a CREATE TABLE statement mainly performs syntax validation. What actually wires the input source and the output sink together and runs them is the INSERT statement. In other words, if an input or output table is defined with a wrong option such as connector='abc', no error is returned when the CREATE TABLE statement is executed; the error is only reported when the INSERT statement runs. Let's take a simple piece of code as an example:
import java.time.ZoneOffset.ofHours

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.EnvironmentSettings
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment

object InsertDemo {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(10)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val settings = EnvironmentSettings.newInstance()
      .useBlinkPlanner()
      .inStreamingMode()
      .build()
    val tableEnv = StreamTableEnvironment.create(env, settings)
    tableEnv.getConfig.setLocalTimeZone(ofHours(8))
    // emit early (partial) window results every 5 seconds
    tableEnv.getConfig.getConfiguration.setBoolean("table.exec.emit.early-fire.enabled", true)
    tableEnv.getConfig.getConfiguration.setString("table.exec.emit.early-fire.delay", "5000ms")

    // source table: only syntax validation and catalog registration happen here
    tableEnv.executeSql(
      """
        |CREATE TABLE input (
        |  userId STRING,
        |  pageId STRING,
        |  sign STRING,
        |  proctime AS PROCTIME(),  -- generates a processing-time attribute using a computed column
        |  eventTime TIMESTAMP(3),
        |  WATERMARK FOR eventTime AS eventTime - INTERVAL '5' SECOND  -- defines the watermark and marks eventTime as the event-time attribute
        |) WITH (
        |  'connector' = 'kafka',                 -- using the kafka connector
        |  'topic' = 'flinksource',               -- kafka topic
        |  'scan.startup.mode' = 'latest-offset', -- read from the latest offset
        |  'properties.bootstrap.servers' = 'hadoop1:9092,hadoop2:9092,hadoop3:9092', -- kafka broker addresses
        |  'format' = 'json'                      -- the data format is json
        |)
        |""".stripMargin)

    // sink table
    tableEnv.executeSql(
      """
        |CREATE TABLE output (
        |  userId STRING,
        |  pageId STRING,
        |  cnt BIGINT,
        |  startTime TIMESTAMP(3),  -- matches the TIMESTAMP(3) returned by HOP_START
        |  endTime TIMESTAMP(3)     -- matches the TIMESTAMP(3) returned by HOP_END
        |) WITH (
        |  'connector' = 'print'
        |)
        |""".stripMargin)

    // the INSERT is where source and sink are actually resolved, planned and submitted
    tableEnv.executeSql(
      """
        |insert into output
        |select
        |  userId,
        |  pageId,
        |  count(*) as cnt,
        |  HOP_START(eventTime, interval '1' HOUR, interval '1' DAY) as startTime,
        |  HOP_END(eventTime, interval '1' HOUR, interval '1' DAY) as endTime
        |from (
        |  select * from input where sign = 'error'
        |) a
        |group by userId, pageId, hop(eventTime, interval '1' HOUR, interval '1' DAY)
        |""".stripMargin)

    // executeSql(INSERT ...) already submits the job, so no env.execute() call is
    // needed; calling it would fail because no DataStream operators were defined.
    // env.execute("insertDemo")
  }
}
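To make the deferred-validation point concrete, here is a small follow-up sketch (the table name bad_output and the reuse of tableEnv are made up for illustration): the CREATE TABLE succeeds even though no connector named abc exists, and the failure only surfaces when the INSERT is translated and planned, i.e. when the connector factory lookup behind the sqlNode --> RelNode conversion actually runs:
// Succeeds: only syntax validation and catalog registration happen here,
// the connector factory for 'abc' is not looked up yet.
tableEnv.executeSql(
  """
    |CREATE TABLE bad_output (
    |  userId STRING
    |) WITH (
    |  'connector' = 'abc'
    |)
    |""".stripMargin)

// Fails here with a ValidationException: no factory with identifier 'abc'
// can be discovered via SPI when the insert statement is planned.
tableEnv.executeSql("insert into bad_output select userId from input")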
2.