8. Table API and Flink SQL
Introduction
- Flink provides a unified high-level API for both batch and stream processing
- The Table API is a query API embedded in the Java and Scala languages that lets you compose queries from relational operators in a very intuitive way
- Flink's SQL support is based on Apache Calcite, which implements the SQL standard
Simple example
package day7

import com.atguigu.apitest.SensorReading
import org.apache.flink.streaming.api.scala._
import org.apache.flink.table.api._
import org.apache.flink.table.api.scala._

object TableExample_study1 {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // read the data and create a DataStream
    val inputStream: DataStream[String] = env.readTextFile("D:\\Flink\\20-Flink【www.hoh0.com】\\FlinkTutorial\\src\\main\\resources\\sensor.txt")
    val dataStream: DataStream[SensorReading] = inputStream
      .map(data => {
        val dataArray = data.split(",")
        SensorReading(dataArray(0), dataArray(1).toLong, dataArray(2).toDouble)
      })

    // create the table execution environment
    val tableEnv: StreamTableEnvironment = StreamTableEnvironment.create(env)

    // convert the DataStream into a Table and operate on it
    val dataTable: Table = tableEnv.fromDataStream(dataStream)

    // transform with the Table API to get the result
    val resultTable: Table = dataTable
      .select("id, temperature")
      .filter("id == 'sensor_1'")

    // or write SQL directly to get the result
    val resultSqlTable: Table = tableEnv
      .sqlQuery("select id, temperature from " + dataTable + " where id = 'sensor_1'")

    // convert back to a DataStream and print
    //val resultStream: DataStream[(String, Double)] = resultTable.toAppendStream[(String, Double)]
    val resultStream: DataStream[(String, Double)] = resultSqlTable.toAppendStream[(String, Double)]
    resultStream.print("result")
    resultTable.printSchema()
    /* output:
    root
     |-- id: STRING
     |-- temperature: DOUBLE
    result> (sensor_1,35.8)
    result> (sensor_1,37.2)
    result> (sensor_1,33.5)
    result> (sensor_1,38.1)
    */
    env.execute("table api test")
  }
}
Creating a TableEnvironment
// 1. create table environments
// 1.1 old-planner streaming environment
val settings: EnvironmentSettings = EnvironmentSettings.newInstance()
  .useOldPlanner()
  .inStreamingMode()
  .build()
val tableEnv: StreamTableEnvironment = StreamTableEnvironment.create(env, settings)

// 1.2 old-planner batch environment
val batchEnv: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
val batchTableEnv: BatchTableEnvironment = BatchTableEnvironment.create(batchEnv)

// 1.3 Blink-planner streaming environment
val bsSettings = EnvironmentSettings.newInstance()
  .useBlinkPlanner()
  .inStreamingMode()
  .build()
val bsTableEnv = StreamTableEnvironment.create(env, bsSettings)

// 1.4 Blink-planner batch environment
val bbSettings = EnvironmentSettings.newInstance()
  .useBlinkPlanner()
  .inBatchMode()
  .build()
val bbTableEnv = TableEnvironment.create(bbSettings)
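The two planners ship as separate artifacts, so which of these environments you can create depends on what is on the classpath. A minimal build.sbt sketch (artifact names as of Flink 1.10; the version number is an assumption, match it to your cluster):

// build.sbt sketch -- the version is an assumption, adjust to your setup
libraryDependencies ++= Seq(
  "org.apache.flink" %% "flink-table-api-scala-bridge" % "1.10.1", // Scala Table API <-> DataStream bridge
  "org.apache.flink" %% "flink-table-planner"          % "1.10.1", // old planner
  "org.apache.flink" %% "flink-table-planner-blink"    % "1.10.1"  // Blink planner
)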
Tables (Table)
- A TableEnvironment can register catalogs (Catalog) and create tables based on a catalog
- A table (Table) is specified by an "identifier" made up of three parts: the catalog name, the database name, and the object name (see the sketch below)
- Tables can be regular (Table) or virtual (views, View)
- A regular table (Table) generally describes external data, such as a file, a database table, or data from a message queue; it can also be converted directly from a DataStream
- A view (View) can be created from an existing table, usually as the result set of a Table API or SQL query
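A minimal sketch of how the three parts of an identifier are resolved (the catalog and database names below are Flink's defaults; sensorTable is assumed to be an existing Table):

// the current catalog and database fill in the unqualified parts of an identifier
tableEnv.useCatalog("default_catalog")
tableEnv.useDatabase("default_database")
// registers the view under default_catalog.default_database.sensorView
tableEnv.createTemporaryView("sensorView", sensorTable)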
Reading data from a file
// 2.1 connect to a file system (CSV)
val filePath = "D:\\Flink\\20-Flink【www.hoh0.com】\\FlinkTutorial\\src\\main\\resources\\sensor.txt"
tableEnv.connect(new FileSystem().path(filePath))
  .withFormat(new OldCsv()) // how to deserialize the data after reading
  .withSchema(new Schema()
    .field("id", DataTypes.STRING())
    .field("timestamp", DataTypes.BIGINT())
    .field("temperature", DataTypes.DOUBLE())
  ) // define the table schema
  .createTemporaryTable("inputTable") // register a table

// switch to a stream and print
val sensorTable: Table = tableEnv.from("inputTable")
sensorTable.toAppendStream[(String, Long, Double)].print()
Reading data from Kafka
tableEnv.connect(new Kafka()
    .version("0.11")
    .topic("sensor")
    .property("bootstrap.servers", "hadoop103:9092")
    .property("zookeeper.connect", "hadoop103:2181")
  )
  .withFormat(new Csv())
  .withSchema(new Schema()
    .field("id", DataTypes.STRING())
    .field("timestamp", DataTypes.BIGINT())
    .field("temp", DataTypes.DOUBLE()))
  .createTemporaryTable("kafkatable")

// switch to a stream and print
val sensorTable: Table = tableEnv.from("kafkatable")
sensorTable.toAppendStream[(String, Long, Double)].print()
env.execute("table api test")
Querying tables (Table API)
Simple query
val sensorTable: Table = tableEnv.from("inputTable")
val resultTable: Table = sensorTable
  .select('id, 'temperature)
  .filter('id === "sensor_1")
Simple query in SQL
val resultSqlTable: Table = tableEnv.sqlQuery(
  """
    |select id, temperature
    |from inputTable
    |where id = 'sensor_1'
  """.stripMargin
)
resultTable.toAppendStream[(String, Double)].print("result")
resultSqlTable.toAppendStream[(String, Double)].print("sql")
Simple aggregation: count the readings of each sensor. Aggregated results update as new rows arrive, so the table must be converted with toRetractStream rather than toAppendStream
val aggResultTable: Table = sensorTable
  .groupBy('id)
  .select('id, 'id.count as 'count)
aggResultTable.toRetractStream[(String, Long)].print("agg")
The same aggregation in SQL
val aggResultSqlTable: Table = tableEnv.sqlQuery("select id, count(id) as cnt from inputTable group by id")
aggResultSqlTable.toRetractStream[(String, Long)].print("agg")
Converting between tables and streams
val settings = EnvironmentSettings.newInstance()
  .useOldPlanner()
  .inStreamingMode()
  .build()
val tableEnv = StreamTableEnvironment.create(env, settings)

// convert the DataStream into a Table, renaming the fields on the way
val sensorTable: Table = tableEnv.fromDataStream(dataStream, 'id, 'timestamp as 'ts, 'temperature as 'temp)
sensorTable.printSchema()
sensorTable.toAppendStream[(String, Long, Double)].print()
Writing data to a file
val settings = EnvironmentSettings.newInstance()
  .useOldPlanner()
  .inStreamingMode()
  .build()
val tableEnv = StreamTableEnvironment.create(env, settings)

// convert the DataStream into a Table
val sensorTable: Table = tableEnv.fromDataStream(dataStream, 'id, 'timestamp as 'ts, 'temperature as 'temp)

// transform the table to get the result table
val resultTable: Table = sensorTable
  .select('id, 'temp)
  .filter('id === "sensor_1")

// define an output table backed by a TableSink that the data is written to
tableEnv.connect(new FileSystem().path("D:\\Flink\\20-Flink【www.hoh0.com】\\FlinkTutorial\\src\\main\\resources\\out.txt"))
  .withFormat(new Csv())
  .withSchema(new Schema()
    .field("id", DataTypes.STRING())
    .field("temperature", DataTypes.DOUBLE())
  )
  .createTemporaryTable("outputTable")

// write the result table to the table sink
resultTable.insertInto("outputTable")
Update modes
- For streaming queries, you must declare how the conversion between a table and an external connector is performed
- The kind of messages exchanged with the external system is specified by the update mode (Update Mode)
Append mode
- The table only performs insert operations and exchanges only insert (Insert) messages with the external connector
Retract mode
- The table exchanges add (Add) and retract (Retract) messages with the external connector
- Inserts (Insert) are encoded as Add messages; deletes (Delete) as Retract messages; updates (Update) as a Retract message for the previous row followed by an Add message for the new row (see the sketch below)
Upsert mode
- Updates and inserts are both encoded as Upsert messages; deletes are encoded as Delete messages
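On the DataStream side the retract encoding is directly visible: toRetractStream produces pairs whose Boolean flag distinguishes Add from Retract. A minimal sketch, reusing the aggResultTable from the aggregation example above:

// each record is (flag, row): true = Add, false = Retract
val retractStream: DataStream[(Boolean, (String, Long))] =
  aggResultTable.toRetractStream[(String, Long)]
retractStream.print("retract") // an update prints (false, old row) then (true, new row)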
Writing data to Kafka
package day7

import org.apache.flink.streaming.api.scala._
import org.apache.flink.table.api.{DataTypes, EnvironmentSettings, Table}
import org.apache.flink.table.api.scala._
import org.apache.flink.table.descriptors.{Csv, Kafka, Schema}

object kafkaTableStudy {
  def main(args: Array[String]): Unit = {
    // create the environments
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    val settings = EnvironmentSettings.newInstance()
      .useOldPlanner()
      .inStreamingMode()
      .build()
    val tableEnv = StreamTableEnvironment.create(env, settings)

    // connect to Kafka and register the input table
    tableEnv.connect(new Kafka()
        .version("0.11")
        .topic("sensor")
        .property("bootstrap.servers", "hadoop103:9092")
        .property("zookeeper.connect", "hadoop103:2181")
      )
      .withFormat(new Csv())
      .withSchema(new Schema()
        .field("id", DataTypes.STRING())
        .field("timestamp", DataTypes.BIGINT())
        .field("temperature", DataTypes.DOUBLE()))
      .createTemporaryTable("kafkaInputTable")

    // transform the input table into the result table
    val resultTable = tableEnv.from("kafkaInputTable")
      .select('id, 'temperature)
      .filter('id === "sensor_1")

    // connect to Kafka and register the output table
    tableEnv.connect(new Kafka()
        .version("0.11")
        .topic("sinkTest")
        .property("bootstrap.servers", "hadoop103:9092")
        .property("zookeeper.connect", "hadoop103:2181")
      )
      .withFormat(new Csv())
      .withSchema(new Schema()
        .field("id", DataTypes.STRING())
        .field("temperature", DataTypes.DOUBLE())
      )
      .createTemporaryTable("kafkaOutputTable")

    // write the result table into the output table
    resultTable.insertInto("kafkaOutputTable")
    env.execute("kafka job")
  }
}
Time attributes (Time Attributes)
- Time-based operations (such as window operations in the Table API and SQL) need information about the time semantics and the source of the time data
- A Table can provide a logical time field for indicating time and accessing the corresponding timestamps in a table program
- A time attribute can be part of every table schema. Once defined, it can be referenced as a field and used in time-based operations
- A time attribute behaves like a regular timestamp: it can be accessed and used in calculations
Defining processing time (Processing Time)
val resultTable: Table = tableEnv.fromDataStream(dataStream, 'id, 'timestamp, 'temperature, 'pt.proctime)
resultTable.printSchema()
resultTable.toAppendStream[Row].print()
- Under processing-time semantics, the table program generates results based on the machine's local time. It is the simplest notion of time and requires neither timestamp extraction nor watermark generation
- It can be specified when converting a DataStream into a table, as above
- It can also be specified while defining a schema, using .proctime to declare a named processing-time field (see the sketch below)
- The proctime attribute can only extend the physical schema by an appended logical field, so it can only be defined at the end of the schema
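A minimal sketch of the schema variant, declaring proctime in a connector's Schema descriptor (the surrounding connect(...) chain is assumed to be the file or Kafka descriptor shown earlier):

.withSchema(new Schema()
  .field("id", DataTypes.STRING())
  .field("timestamp", DataTypes.BIGINT())
  .field("temperature", DataTypes.DOUBLE())
  .field("pt", DataTypes.TIMESTAMP(3))
  .proctime() // marks pt as the processing-time attribute; must be the last field
)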
Defining event time (Event Time)
package day7

import com.atguigu.apitest.SensorReading
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.table.api._
import org.apache.flink.table.api.scala._
import org.apache.flink.types.Row

object TimeAndWindowStudy {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val inputStream: DataStream[String] = env.readTextFile("D:\\Flink\\20-Flink【www.hoh0.com】\\FlinkTutorial\\src\\main\\resources\\sensor.txt")
    val dataStream: DataStream[SensorReading] = inputStream.map(
        data => {
          val dataArray = data.split(",")
          SensorReading(dataArray(0), dataArray(1).toLong, dataArray(2).toDouble)
        }
      )
      .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[SensorReading](Time.seconds(1)) {
        override def extractTimestamp(element: SensorReading): Long = element.timestamp * 1000L
      })

    val settings = EnvironmentSettings.newInstance()
      .useOldPlanner()
      .inStreamingMode()
      .build()
    val tableEnv = StreamTableEnvironment.create(env, settings)

    //val resultTable: Table = tableEnv.fromDataStream(dataStream, 'id, 'timestamp, 'temperature, 'pt.proctime)
    val resultTable: Table = tableEnv.fromDataStream(dataStream, 'id, 'timestamp.rowtime, 'temperature)
    resultTable.printSchema()
    resultTable.toAppendStream[Row].print()
    env.execute("time test")
  }
}
- Event-time semantics let the table program generate results based on the time contained in each record, so correct results are obtained even with out-of-order or late events
- To handle out-of-order events and distinguish on-time from late events in the stream, Flink needs to extract a timestamp from the event data and use it to advance event time
- As with processing time, there are three ways to define event time:
  - when converting a DataStream into a table
  - when defining the table schema
  - in the DDL that creates the table (see the sketch below)
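A minimal sketch of the DDL variant (the watermark syntax requires the Blink planner; the table name, column names, and connector properties are assumptions for illustration):

tableEnv.sqlUpdate(
  """
    |create table sensor_ddl (
    |  id string,
    |  ts bigint,
    |  temperature double,
    |  rt as to_timestamp(from_unixtime(ts)),        -- computed column
    |  watermark for rt as rt - interval '1' second  -- declares rt as the event-time attribute
    |) with (
    |  'connector.type' = 'filesystem',
    |  'connector.path' = 'sensor.txt',
    |  'format.type' = 'csv'
    |)
    |""".stripMargin
)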
Window operations
Group Windows
package day8

import java.sql.Timestamp

import com.atguigu.apitest.SensorReading
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.table.api.{EnvironmentSettings, Over, Table, Tumble}
import org.apache.flink.table.api.scala._
import org.apache.flink.types.Row

object WindowStudy {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val inputStream = env.readTextFile("D:\\Flink\\20-Flink【www.hoh0.com】\\FlinkTutorial\\src\\main\\resources\\sensor.txt")
    val dataStream: DataStream[SensorReading] = inputStream.map(
        data => {
          val dataArray = data.split(",")
          SensorReading(dataArray(0), dataArray(1).toLong, dataArray(2).toDouble)
        }
      )
      .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[SensorReading](Time.seconds(1)) {
        override def extractTimestamp(element: SensorReading): Long = element.timestamp * 1000L
      })

    val settings = EnvironmentSettings.newInstance()
      .useOldPlanner()
      .inStreamingMode()
      .build()
    val tableEnv = StreamTableEnvironment.create(env, settings)
    val sensorTable: Table = tableEnv.fromDataStream(dataStream, 'id, 'timestamp.rowtime as 'ts, 'temperature)

    // tumbling event-time window of 10 seconds, grouped by sensor id
    val groupResultTable: Table = sensorTable
      .window(Tumble over 10.seconds on 'ts as 'tw)
      .groupBy('id, 'tw)
      .select('id, 'id.count, 'tw.end)
    groupResultTable.toRetractStream[(String, Long, Timestamp)].print("group result")
    env.execute("window_job")
  }
}
Group windows in SQL
tableEnv.createTemporaryView("sensor", sensorTable)
val groupResultTableSql: Table = tableEnv.sqlQuery(
  """
    |select id,
    |  count(id),
    |  tumble_end(ts, interval '10' second)
    |from sensor
    |group by
    |  id,
    |  tumble(ts, interval '10' second)
  """.stripMargin
)
//groupResultTable.toRetractStream[(String, Long, Timestamp)].print("group result")
groupResultTableSql.toAppendStream[Row].print("groupsql result")
env.execute("window_job")
- Tumbling windows are defined with the Tumble class
- Sliding windows are defined with the Slide class
- Session windows are defined with the Session class (see the sketch below for the latter two)
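A minimal sketch of the other two window classes, written in the same style as the Tumble example above (Slide and Session are imported from org.apache.flink.table.api):

// sliding window: 10 seconds wide, sliding every 5 seconds
val slideResult: Table = sensorTable
  .window(Slide over 10.seconds every 5.seconds on 'ts as 'sw)
  .groupBy('id, 'sw)
  .select('id, 'id.count, 'sw.end)

// session window: closes after a gap of 10 seconds without events
val sessionResult: Table = sensorTable
  .window(Session withGap 10.seconds on 'ts as 'se)
  .groupBy('id, 'se)
  .select('id, 'id.count, 'se.end)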
Over Windows
val overResultTable: Table = sensorTable
  .window(Over partitionBy 'id orderBy 'ts preceding 2.rows as 'w)
  .select('id, 'ts, 'id.count over 'w, 'temperature.avg over 'w)
overResultTable.toAppendStream[Row].print("overResultTable")
Over windows in SQL
val overResultTableSql = tableEnv.sqlQuery(
"""
|select id, ts,
| count(id) over w,
| avg(temperature) over w
|from sensor
|window w as (
| partition by id
| order by ts
| rows between 2 preceding and current row
|)
|""".stripMargin
)
overResultTableSql.toAppendStream[Row].print("oversql")
- Over window aggregations exist in standard SQL (the OVER clause) and can be defined in the SELECT clause of a query
- An over window aggregation computes, for each input row, an aggregate over a range of neighboring rows
- Over windows are defined with the window(w: OverWindow*) clause and referenced in select() via an alias
Unbounded over windows
- Over windows can be defined on event time or processing time, with a range specified either as a time interval or as a row count
- An unbounded over window is specified with a constant (see the sketch below)
Bounded over windows
- A bounded over window is specified with the size of the interval
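A minimal sketch of both variants (UNBOUNDED_RANGE and UNBOUNDED_ROW are constants exported by org.apache.flink.table.api):

// unbounded: aggregates over all previous rows of the same key
.window(Over partitionBy 'id orderBy 'ts preceding UNBOUNDED_RANGE as 'w)
// bounded by a time interval rather than the row count used in the example above
.window(Over partitionBy 'id orderBy 'ts preceding 1.minutes as 'w)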
9. Functions (Function)
TableFunction (table functions)
- A user-defined table function also takes 0, 1, or more scalar values as input parameters; unlike a scalar function, it can return any number of rows as output rather than a single value
- To define a table function, extend the base class TableFunction in org.apache.flink.table.functions and implement one or more evaluation methods
- The behavior of a table function is determined by its evaluation methods, which must be public and named eval
UDF example
package day8

import com.atguigu.apitest.SensorReading
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.timestamps._
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.table.api.EnvironmentSettings
import org.apache.flink.table.api.scala._
import org.apache.flink.table.functions.TableFunction
import org.apache.flink.types.Row

object TableFunctionStudy {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val inputStream = env.readTextFile("D:\\Flink\\20-Flink【www.hoh0.com】\\FlinkTutorial\\src\\main\\resources\\sensor.txt")
    val dataStream: DataStream[SensorReading] = inputStream
      .map(data => {
        val dataArray = data.split(",")
        SensorReading(dataArray(0), dataArray(1).toLong, dataArray(2).toDouble)
      })
      .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[SensorReading](Time.seconds(1)) {
        override def extractTimestamp(element: SensorReading): Long = element.timestamp * 1000L
      })

    val settings = EnvironmentSettings.newInstance()
      .useOldPlanner()
      .inStreamingMode()
      .build()
    val tableEnv = StreamTableEnvironment.create(env, settings)
    val sensorTable = tableEnv.fromDataStream(dataStream, 'id, 'timestamp.rowtime as 'ts, 'temperature)

    val split = new SplitStudy("_")
    val resultTable = sensorTable
      .joinLateral(split('id) as ('word, 'length)) // pass in id; each split part becomes a (word, length) row
      .select('id, 'ts, 'word, 'length)
    resultTable.toAppendStream[Row].print("result")
    env.execute("job")
  }
}

// splits a string on the given separator and emits one (word, length) row per part
class SplitStudy(separator: String) extends TableFunction[(String, Int)] {
  def eval(str: String): Unit = {
    str.split(separator).foreach(
      word => collect((word, word.length))
    )
  }
}
The same in SQL
tableEnv.createTemporaryView("sensor", sensorTable)
tableEnv.registerFunction("split", split)
val resultTableSql = tableEnv.sqlQuery(
  """
    |select id, ts, word, length
    |from sensor, lateral table(split(id)) as splitid(word, length)
    |""".stripMargin
)
resultTableSql.toAppendStream[Row].print("result sql")
Differences between Table API and SQL functions

Comparison functions

| SQL | Table API |
| --- | --- |
| value1 = value2 | ANY1 === ANY2 |
| value1 > value2 | ANY1 > ANY2 |

Logical functions

| SQL | Table API |
| --- | --- |
| boolean1 OR boolean2 | BOOLEAN1 \|\| BOOLEAN2 |
| boolean IS FALSE | BOOLEAN.isFalse |
| NOT boolean | !BOOLEAN |

Arithmetic functions

| SQL | Table API |
| --- | --- |
| numeric1 + numeric2 | NUMERIC1 + NUMERIC2 |
| POWER(numeric1, numeric2) | NUMERIC1.power(NUMERIC2) |

String functions

| SQL | Table API |
| --- | --- |
| string1 \|\| string2 | STRING1 + STRING2 |
| UPPER(string) | STRING.upperCase() |
| CHAR_LENGTH(string) | STRING.charLength() |

Temporal functions

| SQL | Table API |
| --- | --- |
| DATE string | STRING.toDate |
| TIMESTAMP string | STRING.toTimestamp |
| CURRENT_TIME | currentTime() |
| INTERVAL string range | NUMERIC.days, NUMERIC.minutes |

Aggregate functions

| SQL | Table API |
| --- | --- |
| COUNT(*) | FIELD.count |
| SUM(expression) | FIELD.sum() |
| RANK() | n/a |
| ROW_NUMBER() | n/a |
User-defined functions (UDF)
- User-defined functions (User-defined Functions, UDF) are an important feature: they significantly extend the expressive power of queries
- In most cases, a user-defined function must be registered before it can be used in a query
- Functions are registered in the TableEnvironment by calling its **registerFunction()** method. When a user-defined function is registered, it is inserted into the TableEnvironment's function catalog, so the Table API or SQL parser can recognize and interpret it correctly (see the sketch below)
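A minimal sketch of the registration step, using the HashCodeStudy scalar function from the example below and assuming a view named sensor has already been registered:

// one instance, registered under the name that SQL resolves
tableEnv.registerFunction("hashCode", new HashCodeStudy(10))
val hashed = tableEnv.sqlQuery("select id, hashCode(id) from sensor")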
Scalar Function (scalar functions)
- A user-defined scalar function maps 0, 1, or more scalar values to a new scalar value
- To define a scalar function, extend the base class ScalarFunction in org.apache.flink.table.functions and implement one or more evaluation (eval) methods
- The behavior of a scalar function is determined by its evaluation methods, which must be declared public and named eval
UDF
package day8

import com.atguigu.apitest.SensorReading
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.table.api.{EnvironmentSettings, Table}
import org.apache.flink.table.api.scala._
import org.apache.flink.table.functions.ScalarFunction
import org.apache.flink.types.Row

object ScalarFunctionStudy {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val inputStream = env.readTextFile("D:\\Flink\\20-Flink【www.hoh0.com】\\FlinkTutorial\\src\\main\\resources\\sensor.txt")
    val dataStream: DataStream[SensorReading] = inputStream
      .map(data => {
        val dataArray = data.split(",")
        SensorReading(dataArray(0), dataArray(1).toLong, dataArray(2).toDouble)
      })
      .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[SensorReading](Time.seconds(1)) {
        override def extractTimestamp(element: SensorReading): Long = element.timestamp * 1000L
      })

    val settings = EnvironmentSettings.newInstance()
      .useOldPlanner()
      .inStreamingMode()
      .build()
    val tableEnv = StreamTableEnvironment.create(env, settings)
    val sensorTable: Table = tableEnv.fromDataStream(dataStream, 'id, 'timestamp.rowtime as 'ts, 'temperature)

    val hashCode = new HashCodeStudy(10)
    val resultTable = sensorTable
      .select('id, 'ts, hashCode('id))
    resultTable.toAppendStream[Row].print("result")
    env.execute("job")
  }
}

// user-defined scalar function: maps a string to its hash code times a factor
class HashCodeStudy(factor: Int) extends ScalarFunction {
  def eval(str: String): Int = {
    str.hashCode * factor
  }
}
The same in SQL
tableEnv.createTemporaryView("sensor", sensorTable)
tableEnv.registerFunction("hashCode", hashCode)
val resultTableSql = tableEnv.sqlQuery(
  """
    |select id, ts, hashCode(id) from sensor
    |""".stripMargin
)
resultTableSql.toAppendStream[Row].print("result sql")