I. Flink SQL and Table API examples (getting started)
1. Turning a file stream into a table, and a DataStream into a table
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.scala._
import org.apache.flink.api.scala._
import org.apache.flink.table.api.{DataTypes, EnvironmentSettings, Table}
import org.apache.flink.table.descriptors.{Csv, FileSystem, Schema}

object FlinkTableExample {
  def main(args: Array[String]): Unit = {
    // create the execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // create the table environment using the Blink planner
    val settings = EnvironmentSettings
      .newInstance()
      // unified stream/batch planner
      .useBlinkPlanner()
      .inStreamingMode()
      .build()
    val tableEnv = StreamTableEnvironment.create(env, settings)

    tableEnv
      .connect(new FileSystem().path("/Users/yuanzuo/Desktop/flink-tutorial/FlinkSZ1128/src/main/resources/sensor.txt"))
      .withFormat(new Csv()) // parse the file as CSV
      .withSchema( // define the table schema
        new Schema()
          .field("id", DataTypes.STRING())
          .field("timestamp", DataTypes.BIGINT())
          .field("temperature", DataTypes.DOUBLE())
      )
      .createTemporaryTable("inputTable") // register a temporary table

    val sensorTable: Table = tableEnv.from("inputTable") // get a Table handle for inputTable

    // query with the Table API
    val resultTable: Table = sensorTable
      .select("id, temperature") // project `id`, `temperature` => (String, Double)
      .filter("id = 'sensor_1'") // filter

    resultTable
      .toAppendStream[(String, Double)] // append stream
      .print()

    // query with Flink SQL
    val resultSqlTable: Table = tableEnv
      .sqlQuery("select id, temperature from inputTable where id = 'sensor_1'")
    resultSqlTable
      .toAppendStream[(String, Double)] // append stream
      // .print()

    // convert a DataStream into a dynamic table
    val stream = env.addSource(new SensorSource)
    val table = tableEnv.fromDataStream(stream, 'id, 'timestamp as 'ts, 'temperature as 'temp)
    table
      .select('id, 'temp)
      .toAppendStream[(String, Double)]
      .print()

    env.execute()
  }
}
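The sensor examples in this post reference SensorSource without defining it; it comes from the tutorial project. A minimal sketch of what such a source might look like, assuming a SensorReading(id: String, timestamp: Long, temperature: Double) case class (both names are assumptions made here to keep the sketch self-contained):

import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction}
import scala.util.Random

// assumed event type: the fields match the schema used above
case class SensorReading(id: String, timestamp: Long, temperature: Double)

// emits random readings for a few sensor ids until cancelled
class SensorSource extends RichParallelSourceFunction[SensorReading] {
  @volatile private var running = true

  override def run(ctx: SourceFunction.SourceContext[SensorReading]): Unit = {
    val rand = new Random()
    while (running) {
      val now = System.currentTimeMillis()
      (1 to 3).foreach { i =>
        ctx.collect(SensorReading(s"sensor_$i", now, 15 + rand.nextGaussian() * 10))
      }
      Thread.sleep(100)
    }
  }

  override def cancel(): Unit = running = false
}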
II. Tumbling window: counting each sensor's readings within 10 seconds
1. Table API implementation
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.scala._
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.table.api.{EnvironmentSettings, Tumble}

/**
 * Tumbling window: count each sensor's readings within 10 seconds
 */
object CountTempByTableApi {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val stream = env
      .addSource(new SensorSource)
      .assignAscendingTimestamps(_.timestamp)

    // table-related code
    val settings = EnvironmentSettings
      .newInstance()
      .useBlinkPlanner()
      .inStreamingMode()
      .build()
    val tableEnv = StreamTableEnvironment.create(env, settings)

    // convert the stream into a dynamic table
    val dataTable = tableEnv
      // .rowtime marks the event-time attribute; `as` assigns an alias
      .fromDataStream(stream, 'id, 'timestamp.rowtime as 'ts, 'temperature as 'temp)
      // tumbling window:
      // use ts as the event-time attribute and alias the window as w
      .window(Tumble over 10.seconds on 'ts as 'w)
      .groupBy('id, 'w) // equivalent to keyBy + timeWindow in the DataStream API
      .select('id, 'id.count) // how many rows fall into each window

    // convert the dynamic table back into a stream
    dataTable
      .toRetractStream[(String, Long)] // `id, id.count`; retract stream
      .print()

    env.execute()
  }
}
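For comparison, the Table API counterpart of the sliding (HOP) window mentioned in the SQL section below is Slide. A minimal sketch, reusing the stream and tableEnv from the example above (the 5-second slide is an arbitrary choice for illustration):

import org.apache.flink.table.api.Slide

// 10-second windows that start every 5 seconds
val slidingTable = tableEnv
  .fromDataStream(stream, 'id, 'timestamp.rowtime as 'ts, 'temperature as 'temp)
  .window(Slide over 10.seconds every 5.seconds on 'ts as 'w)
  .groupBy('id, 'w)
  .select('id, 'id.count)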
2. Flink SQL implementation
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.scala._
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.table.api.EnvironmentSettings

object CountTempBySQL {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val stream = env
      .addSource(new SensorSource)
      .assignAscendingTimestamps(_.timestamp)

    // table-related code
    val settings = EnvironmentSettings
      .newInstance()
      .useBlinkPlanner()
      .inStreamingMode()
      .build()
    val tableEnv = StreamTableEnvironment.create(env, settings)

    // convert the stream into a dynamic table;
    // concatenating dataTable into the SQL string registers it under a generated name
    val dataTable = tableEnv
      .fromDataStream(stream, 'id, 'timestamp.rowtime as 'ts, 'temperature as 'temp)

    val result = tableEnv
      // TUMBLE is the tumbling window; HOP is the sliding (hopping) window, sketched after this example
      .sqlQuery("SELECT id, COUNT(id) FROM " + dataTable + " GROUP BY id, TUMBLE(ts, INTERVAL '10' SECOND)")
      .toRetractStream[(String, Long)]
    result.print() // only a DataStream can be printed

    env.execute()
  }
}
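As the comment in the code notes, HOP is the sliding-window counterpart of TUMBLE. A minimal sketch of the same count with 10-second windows sliding every 5 seconds (the slide interval is an arbitrary choice; note that in HOP the slide comes before the window size):

// HOP(time_attr, slide, size)
val hopResult = tableEnv
  .sqlQuery("SELECT id, COUNT(id) FROM " + dataTable +
    " GROUP BY id, HOP(ts, INTERVAL '5' SECOND, INTERVAL '10' SECOND)")
  .toRetractStream[(String, Long)]
hopResult.print()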
III. leftOuterJoinLateral example
1. Table API and Flink SQL implementations (both in one program)
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.scala._
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.table.api.EnvironmentSettings
import org.apache.flink.table.functions.TableFunction
import org.apache.flink.types.Row

object TableFunctionExample {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // set parallelism 1 on the main stream
    env.setParallelism(1)
    // use event time
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val stream = env
      .fromElements("hello#world", "atguigu#zuoyuan")

    // table-related code
    val settings = EnvironmentSettings
      .newInstance()
      .useBlinkPlanner()
      .inStreamingMode()
      .build()
    // create the table environment
    val tableEnv = StreamTableEnvironment.create(env, settings)

    val split = new Split("#")

    // 's refers to the strings "hello#world" and "atguigu#zuoyuan" in the stream
    val dataTable = tableEnv.fromDataStream(stream, 's)

    // 1. Table API implementation
    dataTable
      // why join? to put s and the rows produced by split(s) onto the same row
      // .joinLateral(split('s) as ('word, 'length)) // inner variant; see the SQL sketch after this example
      .leftOuterJoinLateral(split('s) as ('word, 'length))
      .select('s, 'word, 'length)
      .toAppendStream[(String, String, Int)]
      // .print()

    // 2. Flink SQL implementation: register the UDF first
    tableEnv.registerFunction("split", new Split("#"))
    tableEnv.createTemporaryView("t", dataTable)
    tableEnv
      // T is the alias of the lateral table; (word, length) name its columns
      .sqlQuery(
        """
          |SELECT s, word, length FROM
          |  t
          |  LEFT JOIN LATERAL TABLE(split(s)) AS T(word, length) ON TRUE""".stripMargin)
      .toAppendStream[Row]
      .print()

    env.execute()
  }

  // a table function that splits a string and emits one (word, length) row per token
  class Split(separator: String) extends TableFunction[(String, Int)] {
    def eval(str: String): Unit = {
      str.split(separator).foreach(word => collect((word, word.length)))
    }
  }
}
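The commented-out joinLateral call is the inner variant: rows for which split produces no output are dropped, while leftOuterJoinLateral keeps them with NULL word and length. In SQL the inner variant is written as a comma (cross) join against LATERAL TABLE. A minimal sketch, reusing the registered split function and the view t from the example above:

tableEnv
  .sqlQuery(
    """
      |SELECT s, word, length
      |FROM t, LATERAL TABLE(split(s)) AS T(word, length)""".stripMargin)
  .toAppendStream[Row]
  .print()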
IV. Scalar functions
1. Defining a scalar function
A user-defined scalar function maps zero, one, or more scalar values to a new scalar value.
To define a scalar function, extend the base class ScalarFunction in org.apache.flink.table.functions and implement one or more evaluation methods. The behavior of a scalar function is determined by its evaluation methods, which must be declared public and named eval (declared directly with def, no override). The parameter types and return type of an evaluation method determine the parameter and return types of the scalar function.
In the code below, we define our own HashCode function, register it with the TableEnvironment, and call it in a query.
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.scala._
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.table.api.EnvironmentSettings
import org.apache.flink.table.functions.ScalarFunction

object ScalarFunctionExample {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val stream = env
      .addSource(new SensorSource)
      .assignAscendingTimestamps(_.timestamp)

    // table-related code
    val settings = EnvironmentSettings
      .newInstance()
      .useBlinkPlanner()
      .inStreamingMode()
      .build()
    val tableEnv = StreamTableEnvironment.create(env, settings)

    val hashCode = new HashCode(10)

    // convert the stream into a dynamic table
    val dataTable = tableEnv
      .fromDataStream(stream, 'id, 'timestamp.rowtime as 'ts, 'temperature as 'temp) // .rowtime marks event time

    // Table API: call the function object directly
    dataTable
      .select('id, hashCode('id))
      .toAppendStream[(String, Int)]
      // .print()

    // Flink SQL: register the UDF first
    tableEnv.registerFunction("hashCode", new HashCode(10))
    // tableEnv.createTemporaryView("t", dataTable, 'id)
    tableEnv
      .sqlQuery("SELECT id, hashCode(id) FROM " + dataTable)
      .toAppendStream[(String, Int)]
      .print()

    env.execute()
  }

  class HashCode(val factor: Int) extends ScalarFunction {
    def eval(s: String): Int = {
      s.hashCode() * factor
    }
  }
}
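Since a scalar function's behavior is fully determined by its eval method, eval can also be called directly as an ordinary method, which is handy for unit testing. A small sketch:

// HashCode is an inner class of the ScalarFunctionExample object above
val hc = new ScalarFunctionExample.HashCode(10)
// for factor = 10 the result is simply the string's hashCode times 10
assert(hc.eval("sensor_1") == "sensor_1".hashCode * 10)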