16. Flink Table & SQL
16.1. Why Table & SQL
16.2. History
16.3. API
16.4. Core Concepts
16.5. Example 1
16.6. Example 2
16.7. Example 3
16.8. Example 4
16. Flink Table & SQL
16.1. Why Table & SQL
16.2. History
16.3. API
- Dependencies: https://ci.apache.org/projects/flink/flink-docs-release-1.12/dev/table/
- Program structure: https://ci.apache.org/projects/flink/flink-docs-release-1.12/dev/table/common.html
- Creating a TableEnvironment: https://ci.apache.org/projects/flink/flink-docs-release-1.12/dev/table/common.html
- Creating tables: https://ci.apache.org/projects/flink/flink-docs-release-1.12/dev/table/common.html
- Querying tables: https://ci.apache.org/projects/flink/flink-docs-release-1.12/dev/table/common.html
- Integrating with DataStream: https://ci.apache.org/projects/flink/flink-docs-release-1.12/dev/table/common.html
A skeleton tying these steps together is sketched below.
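The common.html page above describes a fixed program structure: create the environments, create tables, query, emit, execute. A minimal runnable sketch of that structure (the `input` table and the datagen connector are purely illustrative, not from the examples below):

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;

public class TableProgramSkeleton {
    public static void main(String[] args) throws Exception {
        // 1. Set up the stream and table environments (Blink planner, streaming mode).
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        EnvironmentSettings settings = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build();
        StreamTableEnvironment tenv = StreamTableEnvironment.create(env, settings);
        // 2. Create an input table, e.g. via DDL over a connector.
        tenv.executeSql("CREATE TABLE input (word STRING) WITH ('connector' = 'datagen')");
        // 3. Query it with SQL or the Table API.
        Table result = tenv.sqlQuery("select word from input");
        // 4. Emit the result (here: back to a DataStream) and launch the job.
        tenv.toAppendStream(result, Row.class).print();
        env.execute();
    }
}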
16.4. Core Concepts
- Dynamic table / unbounded table: a table backed by an unbounded stream; each new stream record appends to (or updates) the table.
- Continuous query: a query over a dynamic table never terminates and keeps updating its result as new data arrives, which requires Flink State to hold the intermediate results (see the sketch below).
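A minimal sketch of both ideas at once (the names and data here are illustrative, not from the examples below): every stream element becomes a row of a dynamic table, and a grouped count over it is a continuous query whose per-key counts live in Flink state.

import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;

import static org.apache.flink.table.api.Expressions.$;

public class DynamicTableSketch {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        EnvironmentSettings settings = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build();
        StreamTableEnvironment tenv = StreamTableEnvironment.create(env, settings);

        // Every element appended to the stream becomes a new row of the dynamic table.
        DataStream<Tuple2<String, String>> clicks = env.fromElements(
                Tuple2.of("Mary", "/home"),
                Tuple2.of("Bob", "/cart"),
                Tuple2.of("Mary", "/prod"));
        Table clickTable = tenv.fromDataStream(clicks, $("user"), $("url"));

        // The continuous query holds one count per user in Flink state and
        // re-emits an updated row each time that user's count changes.
        Table counts = clickTable
                .groupBy($("user"))
                .select($("user"), $("url").count().as("cnt"));

        // Updates surface in the retract stream as (false, old row) then (true, new row).
        tenv.toRetractStream(counts, Row.class).print();
        env.execute();
    }
}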
16.5. Example 1
Convert DataStream data to a Table and a View, then query them with SQL.
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

import java.util.Arrays;

import static org.apache.flink.table.api.Expressions.$;

/**
 * Flink Table & SQL demo - converting a DataStream to a Table.
 *
 * @author tuzuoquan
 * @date 2022/6/2 14:50
 */
public class Demo01 {

    public static void main(String[] args) throws Exception {
        //TODO 0.env
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        EnvironmentSettings settings = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build();
        StreamTableEnvironment tenv = StreamTableEnvironment.create(env, settings);

        //TODO 1.source
        DataStream<Order> orderA = env.fromCollection(Arrays.asList(
                new Order(1L, "beer", 3),
                new Order(1L, "diaper", 4),
                new Order(3L, "rubber", 2)));
        DataStream<Order> orderB = env.fromCollection(Arrays.asList(
                new Order(2L, "pen", 3),
                new Order(2L, "rubber", 3),
                new Order(4L, "beer", 1)));

        //TODO 2.transformation
        // Convert the DataStreams to a Table and a View, then query them.
        Table tableA = tenv.fromDataStream(orderA, $("user"), $("product"), $("amount"));
        tableA.printSchema();
        System.out.println(tableA);
        System.out.println("==========================================");
        tenv.createTemporaryView("tableB", orderB, $("user"), $("product"), $("amount"));

        // Query: rows of tableA with amount > 2, unioned with rows of tableB with amount > 1.
        /*
        select * from tableA where amount > 2
        union
        select * from tableB where amount > 1
        */
        // Concatenating the Table object registers it under a generated name (e.g. UnnamedTable$0).
        String sql = "select * from " + tableA + " where amount > 2 " +
                "union " +
                "select * from tableB where amount > 1";
        Table resultTable = tenv.sqlQuery(sql);
        resultTable.printSchema();
        //UnnamedTable$1
        System.out.println(resultTable);
        System.out.println("==========================================");

        // Convert the Table back to a DataStream.
        //DataStream<Order> resultDS = tenv.toAppendStream(resultTable, Order.class); //use toAppendStream for union all
        // union (distinct) produces updates, so it needs toRetractStream
        DataStream<Tuple2<Boolean, Order>> resultDS = tenv.toRetractStream(resultTable, Order.class);
        //toAppendStream → appends each computed row to the result DataStream
        //toRetractStream → emits updates as (true, row) for additions and (false, row) for retractions
        //comparable to the append/update/complete output modes in StructuredStreaming

        //TODO 3.sink
        resultDS.print();

        //TODO 4.execute
        env.execute();
    }

    @Data
    @NoArgsConstructor
    @AllArgsConstructor
    public static class Order {
        public Long user;
        public String product;
        public int amount;
    }
}
Output:
root
|-- user: BIGINT
|-- product: STRING
|-- amount: INT
UnnamedTable$0
==========================================
root
|-- user: BIGINT
|-- product: STRING
|-- amount: INT
UnnamedTable$1
==========================================
8> (true,Demo01.Order(user=1, product=diaper, amount=4))
7> (true,Demo01.Order(user=1, product=beer, amount=3))
7> (true,Demo01.Order(user=2, product=pen, amount=3))
7> (true,Demo01.Order(user=2, product=rubber, amount=3))
Process finished with exit code 0
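For comparison, a sketch of the commented-out append path (reusing tenv, tableA, and tableB from Demo01): union all keeps duplicates, so the result table is append-only and toAppendStream suffices.

// union all produces an append-only table: no row is ever updated or retracted.
String appendSql = "select * from " + tableA + " where amount > 2 " +
        "union all " +
        "select * from tableB where amount > 1";
Table appendTable = tenv.sqlQuery(appendSql);
// Append-only results can skip the Boolean retraction flag.
DataStream<Order> appendDS = tenv.toAppendStream(appendTable, Order.class);
appendDS.print();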
16.6. Example 2
Implement WordCount in both SQL style and Table API (DSL) style.
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

import static org.apache.flink.table.api.Expressions.$;

/**
 * @author tuzuoquan
 * @date 2022/6/2 16:02
 */
public class Demo02 {

    public static void main(String[] args) throws Exception {
        //TODO 0.env
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        EnvironmentSettings settings = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build();
        StreamTableEnvironment tenv = StreamTableEnvironment.create(env, settings);

        //TODO 1.source
        DataStream<WC> wordsDS = env.fromElements(
                new WC("Hello", 1),
                new WC("World", 1),
                new WC("Hello", 1)
        );

        //TODO 2.transformation
        // Register the DataStream as a View (or convert it to a Table).
        tenv.createTemporaryView("t_words", wordsDS, $("word"), $("frequency"));
        /*
        select word, sum(frequency) as frequency
        from t_words
        group by word
        */
        String sql = "select word, sum(frequency) as frequency " +
                "from t_words " +
                "group by word";
        // Run the SQL query.
        Table resultTable = tenv.sqlQuery(sql);
        // The aggregate keeps updating its result, so convert with toRetractStream.
        DataStream<Tuple2<Boolean, WC>> resultDS = tenv.toRetractStream(resultTable, WC.class);
        //toAppendStream → appends each computed row to the result DataStream
        //toRetractStream → emits updates as (true, row) for additions and (false, row) for retractions
        //comparable to the append/update/complete output modes in StructuredStreaming

        //TODO 3.sink
        resultDS.print();
        // Input:
        //new WC("Hello", 1),
        //new WC("World", 1),
        //new WC("Hello", 1)
        // Expected output:
        //(true,Demo02.WC(word=Hello, frequency=1))
        //(true,Demo02.WC(word=World, frequency=1))
        //(false,Demo02.WC(word=Hello, frequency=1))
        //(true,Demo02.WC(word=Hello, frequency=2))

        //TODO 4.execute
        env.execute();
    }

    @Data
    @NoArgsConstructor
    @AllArgsConstructor
    public static class WC {
        public String word;
        public long frequency;
    }
}
Output:
7> (true,Demo02.WC(word=Hello, frequency=1))
4> (true,Demo02.WC(word=World, frequency=1))
7> (false,Demo02.WC(word=Hello, frequency=1))
7> (true,Demo02.WC(word=Hello, frequency=2))
The same WordCount implemented in Table API (DSL) style; the filter keeps only words whose running count equals 2:
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

import static org.apache.flink.table.api.Expressions.$;

/**
 * Flink Table & SQL demo - WordCount in both SQL and Table API styles.
 *
 * @author tuzuoquan
 * @date 2022/6/2 17:41
 */
public class Demo02_2 {

    public static void main(String[] args) throws Exception {
        //TODO 0.env
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        EnvironmentSettings settings = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build();
        StreamTableEnvironment tenv = StreamTableEnvironment.create(env, settings);

        //TODO 1.source
        DataStream<WC> wordsDS = env.fromElements(
                new WC("Hello", 1),
                new WC("World", 1),
                new WC("Hello", 1)
        );

        //TODO 2.transformation
        // Convert the DataStream to a Table.
        Table table = tenv.fromDataStream(wordsDS);
        // Query in Table/DSL style.
        Table resultTable = table
                .groupBy($("word"))
                .select($("word"), $("frequency").sum().as("frequency"))
                .filter($("frequency").isEqual(2));
        // Convert back to a DataStream.
        DataStream<Tuple2<Boolean, WC>> resultDS = tenv.toRetractStream(resultTable, WC.class);

        //TODO 3.sink
        resultDS.print();

        //TODO 4.execute
        env.execute();
    }

    @Data
    @NoArgsConstructor
    @AllArgsConstructor
    public static class WC {
        public String word;
        public long frequency;
    }
}
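In SQL, filtering on the aggregate like the DSL query above corresponds to a HAVING clause. A sketch in the Demo02 context (reusing tenv, the WC class, and the t_words view registered there):

// Equivalent to the DSL query above: group, aggregate, then filter the aggregate.
String havingSql = "select word, sum(frequency) as frequency " +
        "from t_words " +
        "group by word " +
        "having sum(frequency) = 2";
Table havingTable = tenv.sqlQuery(havingSql);
tenv.toRetractStream(havingTable, WC.class).print();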
16.7. Example 3
Requirement:
Use Flink SQL to compute, per 5-second window, each user's order count, maximum order amount, and minimum order amount;
that is, every 5 seconds, compute those statistics over the last 5 seconds for each user.
With the DataStream API this is just an event-time tumbling window,
so here we implement the same thing with the Flink Table & SQL API instead.
Constraint: use event time + watermarks + the window support in both Flink SQL and the Table API.
package day4;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.RichSourceFunction;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;

import java.time.Duration;
import java.util.Random;
import java.util.UUID;
import java.util.concurrent.TimeUnit;

import static org.apache.flink.table.api.Expressions.$;

/**
 * Flink Table & SQL demo - order statistics with event time + watermark + window.
 *
 * @author tuzuoquan
 * @date 2022/6/2 18:12
 */
public class Demo03 {

    public static void main(String[] args) throws Exception {
        //TODO 0.env
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        EnvironmentSettings settings = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build();
        StreamTableEnvironment tenv = StreamTableEnvironment.create(env, settings);

        //TODO 1.source
        // Generate one random order per second.
        DataStreamSource<Order> orderDS = env.addSource(new RichSourceFunction<Order>() {
            private Boolean isRunning = true;

            @Override
            public void run(SourceContext<Order> ctx) throws Exception {
                Random random = new Random();
                while (isRunning) {
                    Order order = new Order(
                            UUID.randomUUID().toString(),
                            random.nextInt(3),
                            random.nextInt(101),
                            System.currentTimeMillis());
                    TimeUnit.SECONDS.sleep(1);
                    ctx.collect(order);
                }
            }

            @Override
            public void cancel() {
                isRunning = false;
            }
        });

        //TODO 2.transformation
        // Requirement: event time + watermark + a window in Flink SQL / the Table API.
        DataStream<Order> orderDSWithWatermark = orderDS.assignTimestampsAndWatermarks(WatermarkStrategy
                .<Order>forBoundedOutOfOrderness(Duration.ofSeconds(5))
                .withTimestampAssigner((order, recordTimestamp) -> order.getCreateTime())
        );
        // Register the DataStream as a View; the event-time column must be marked with .rowtime().
        tenv.createTemporaryView("t_order", orderDSWithWatermark, $("orderId"), $("userId"),
                $("money"), $("createTime").rowtime());
        /*
        select userId, count(orderId) as orderCount, max(money) as maxMoney, min(money) as minMoney
        from t_order
        group by userId,
        tumble(createTime, INTERVAL '5' SECOND)
        */
        String sql = "select userId, count(orderId) as orderCount, max(money) as maxMoney, min(money) as minMoney " +
                "from t_order " +
                "group by userId, " +
                "tumble(createTime, INTERVAL '5' SECOND)";
        // Run the SQL query.
        Table resultTable = tenv.sqlQuery(sql);
        DataStream<Tuple2<Boolean, Row>> resultDS = tenv.toRetractStream(resultTable, Row.class);

        //TODO 3.sink
        resultDS.print();

        //TODO 4.execute
        env.execute();
    }

    @Data
    @AllArgsConstructor
    @NoArgsConstructor
    public static class Order {
        private String orderId;
        private Integer userId;
        private Integer money;
        // event time
        private Long createTime;
    }
}
Output:
6> (true,2,2,85,44)
7> (true,0,1,66,66)
6> (true,2,2,54,27)
6> (true,1,2,86,58)
6> (true,2,1,42,42)
7> (true,0,2,96,95)
6> (true,1,2,27,17)
6> (true,2,3,68,49)
The same aggregation implemented with the Table API's Tumble window:
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.RichSourceFunction;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.Tumble;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;

import java.time.Duration;
import java.util.Random;
import java.util.UUID;
import java.util.concurrent.TimeUnit;

import static org.apache.flink.table.api.Expressions.$;
import static org.apache.flink.table.api.Expressions.lit;

/**
 * Flink Table & SQL demo - order statistics with event time + watermark + window (Table API version).
 *
 * @author tuzuoquan
 * @date 2022/6/2 18:12
 */
public class Demo03_2 {

    public static void main(String[] args) throws Exception {
        //TODO 0.env
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        EnvironmentSettings settings = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build();
        StreamTableEnvironment tenv = StreamTableEnvironment.create(env, settings);

        //TODO 1.source
        // Generate one random order per second.
        DataStreamSource<Order> orderDS = env.addSource(new RichSourceFunction<Order>() {
            private Boolean isRunning = true;

            @Override
            public void run(SourceContext<Order> ctx) throws Exception {
                Random random = new Random();
                while (isRunning) {
                    Order order = new Order(
                            UUID.randomUUID().toString(),
                            random.nextInt(3),
                            random.nextInt(101),
                            System.currentTimeMillis());
                    TimeUnit.SECONDS.sleep(1);
                    ctx.collect(order);
                }
            }

            @Override
            public void cancel() {
                isRunning = false;
            }
        });

        //TODO 2.transformation
        // Requirement: event time + watermark + a window in Flink SQL / the Table API.
        DataStream<Order> orderDSWithWatermark = orderDS.assignTimestampsAndWatermarks(WatermarkStrategy
                .<Order>forBoundedOutOfOrderness(Duration.ofSeconds(5))
                .withTimestampAssigner((order, recordTimestamp) -> order.getCreateTime())
        );
        // Register the DataStream as a View; the event-time column must be marked with .rowtime().
        tenv.createTemporaryView("t_order", orderDSWithWatermark, $("orderId"), $("userId"),
                $("money"), $("createTime").rowtime());
        /*
        Equivalent SQL:
        select userId, count(orderId) as orderCount, max(money) as maxMoney, min(money) as minMoney
        from t_order
        group by userId,
        tumble(createTime, INTERVAL '5' SECOND)
        */
        Table resultTable = tenv.from("t_order")
                .window(Tumble.over(lit(5).second())
                        .on($("createTime"))
                        .as("tumbleWindow"))
                .groupBy($("tumbleWindow"), $("userId"))
                .select(
                        $("userId"),
                        $("orderId").count().as("orderCount"),
                        $("money").max().as("maxMoney"),
                        $("money").min().as("minMoney")
                );
        DataStream<Tuple2<Boolean, Row>> resultDS = tenv.toRetractStream(resultTable, Row.class);

        //TODO 3.sink
        resultDS.print();

        //TODO 4.execute
        env.execute();
    }

    @Data
    @AllArgsConstructor
    @NoArgsConstructor
    public static class Order {
        private String orderId;
        private Integer userId;
        private Integer money;
        // event time
        private Long createTime;
    }
}
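The group-window alias can also be projected to expose the window bounds. A sketch in the Demo03_2 context (windowStart/windowEnd are illustrative names):

// Projecting the window alias exposes the window start and end timestamps.
Table withBounds = tenv.from("t_order")
        .window(Tumble.over(lit(5).second()).on($("createTime")).as("w"))
        .groupBy($("w"), $("userId"))
        .select(
                $("userId"),
                $("w").start().as("windowStart"),
                $("w").end().as("windowEnd"),
                $("orderId").count().as("orderCount"));
tenv.toRetractStream(withBounds, Row.class).print();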
16.8. Example 4
Read JSON events from Kafka, keep only the records with status='success' using SQL, and write them back to another Kafka topic.
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableResult;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;

/**
 * Flink Table & SQL demo - reading from and writing to Kafka.
 *
 * @author tuzuoquan
 * @date 2022/6/2 19:52
 */
public class Demo04 {

    public static void main(String[] args) throws Exception {
        //TODO 0.env
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        EnvironmentSettings settings = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build();
        StreamTableEnvironment tenv = StreamTableEnvironment.create(env, settings);

        //TODO 1.source
        // Input records are JSON objects such as {"user_id":1,"page_id":1,"status":"success"}.
        TableResult inputTable = tenv.executeSql(
                "CREATE TABLE input_kafka (\n" +
                "  `user_id` BIGINT,\n" +
                "  `page_id` BIGINT,\n" +
                "  `status` STRING\n" +
                ") WITH (\n" +
                "  'connector' = 'kafka',\n" +
                "  'topic' = 'input_kafka',\n" +
                "  'properties.bootstrap.servers' = 'node1:9092',\n" +
                "  'properties.group.id' = 'testGroup',\n" +
                "  'scan.startup.mode' = 'latest-offset',\n" +
                "  'format' = 'json'\n" +
                ")"
        );

        //TODO 2.transformation
        // SQL that keeps only the records whose status is success.
        String sql = "select * from input_kafka where status='success'";
        Table etlResult = tenv.sqlQuery(sql);

        //TODO 3.sink
        DataStream<Tuple2<Boolean, Row>> resultDS = tenv.toRetractStream(etlResult, Row.class);
        resultDS.print();

        TableResult outputTable = tenv.executeSql(
                "CREATE TABLE output_kafka (\n" +
                "  `user_id` BIGINT,\n" +
                "  `page_id` BIGINT,\n" +
                "  `status` STRING\n" +
                ") WITH (\n" +
                "  'connector' = 'kafka',\n" +
                "  'topic' = 'output_kafka',\n" +
                "  'properties.bootstrap.servers' = 'node1:9092',\n" +
                "  'format' = 'json',\n" +
                "  'sink.partitioner' = 'round-robin'\n" +
                ")"
        );
        // Concatenating the Table registers it under a generated name that the INSERT can reference.
        tenv.executeSql("insert into output_kafka select * from " + etlResult);

        //TODO 4.execute
        env.execute();
    }

    @Data
    @AllArgsConstructor
    @NoArgsConstructor
    public static class Order {
        private String orderId;
        private Integer userId;
        private Integer money;
        // event time
        private Long createTime;
    }
}
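To feed the job in a local test, one option is a small producer using the plain Kafka client (a sketch, assuming kafka-clients is on the classpath; broker address and topic match the DDL above):

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import java.util.Properties;

public class InputKafkaProducer {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "node1:9092");
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            // JSON matching the input_kafka schema: user_id BIGINT, page_id BIGINT, status STRING.
            producer.send(new ProducerRecord<>("input_kafka",
                    "{\"user_id\":1,\"page_id\":1,\"status\":\"success\"}"));
            producer.send(new ProducerRecord<>("input_kafka",
                    "{\"user_id\":2,\"page_id\":2,\"status\":\"fail\"}"));
        }
    }
}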