This article summarizes the three kinds of dual-stream joins in Flink Table & SQL:
- Regular Join
- Interval Join
- Window Join
Regular Join
Regular Join: the ordinary, unwindowed join of two streams.
Keep the following points in mind when using it:
- By default, the entire input of both streams is kept in state. To keep state from growing without bound, set an idle state retention time via the Query Configuration (see the sketch after this list).
- Currently only equi-joins are supported.
- Outer joins (LEFT JOIN/RIGHT JOIN/FULL JOIN) produce ADD/RETRACT messages, i.e. the result is a RetractStream; keep this in mind when emitting the output.
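A minimal sketch of setting the idle state retention time on the Query Configuration, assuming the same Flink 1.10 Blink planner API as the example below (the 12h/24h retention values are illustrative):
tableEnv.getConfig().setIdleStateRetentionTime(
        org.apache.flink.api.common.time.Time.hours(12),  // min idle time before a key's state may be cleared
        org.apache.flink.api.common.time.Time.hours(24)); // max idle time; state is cleared between the two bounds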
Code example
package com.bigdata.flink.streamJoin.regularJoin;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
/**
* Author: Wang Pei
* Summary:
* Regular Join
*/
public class RegularJoin {
public static void main(String[] args) throws Exception {
EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().useBlinkPlanner().build();
StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
StreamTableEnvironment tableEnv = StreamTableEnvironment.create(streamEnv, settings);
// Stream 1
// Browse stream, e.g. {"userID":"user_1","eventType":"browse","eventTime":"2015-01-01 00:00:00"}
String kafkaBrowse = ""
+ "CREATE TABLE kafka_browse_log "
+ "( "
+ " userID STRING, "
+ " eventType STRING, "
+ " eventTime STRING, "
+ " proctime as PROCTIME() "
+ ") WITH ( "
+ " 'connector.type' = 'kafka', "
+ " 'connector.version' = '0.10', "
+ " 'connector.properties.bootstrap.servers' = 'kafka01:9092', "
+ " 'connector.properties.zookeeper.connect' = 'kafka01:2181', "
+ " 'connector.topic' = 'topic_1', "
+ " 'connector.properties.group.id' = 'c1', "
+ " 'connector.startup-mode' = 'latest-offset', "
+ " 'format.type' = 'json' "
+ ")";
tableEnv.sqlUpdate(kafkaBrowse);
tableEnv.toAppendStream(tableEnv.from("kafka_browse_log"), Row.class).print();
// Stream 2
// User-profile change stream, e.g. {"userID":"user_1","userName":"name1","userAge":10,"userAddress":"Mars"}
String kafkaUser = ""
+ "CREATE TABLE kafka_user_change_log "
+ "( "
+ " userID STRING, "
+ " userName STRING, "
+ " userAge INT, "
+ " userAddress STRING "
+ ") WITH ( "
+ " 'connector.type' = 'kafka', "
+ " 'connector.version' = '0.10', "
+ " 'connector.properties.bootstrap.servers' = 'kafka01:9092', "
+ " 'connector.properties.zookeeper.connect' = 'kafka01:2181', "
+ " 'connector.topic' = 'topic_2', "
+ " 'connector.properties.group.id' = 'c1', "
+ " 'connector.startup-mode' = 'latest-offset', "
+ " 'format.type' = 'json' "
+ ")";
tableEnv.sqlUpdate(kafkaUser);
tableEnv.toAppendStream(tableEnv.from("kafka_user_change_log"), Row.class).print();
// INNER JOIN
//String execSQL = ""
// + "SELECT * "
// + "FROM kafka_browse_log "
// + "INNER JOIN kafka_user_change_log "
// + "ON kafka_browse_log.userID = kafka_user_change_log.userID ";
//tableEnv.toAppendStream(tableEnv.sqlQuery(execSQL), Row.class).print();
// LEFT JOIN
//String execSQL = ""
// + "SELECT * "
// + "FROM kafka_browse_log "
// + "LEFT JOIN kafka_user_change_log "
// + "ON kafka_browse_log.userID = kafka_user_change_log.userID ";
//tableEnv.toRetractStream(tableEnv.sqlQuery(execSQL), Row.class).print();
// RIGHT JOIN
//String execSQL = ""
// + "SELECT * "
// + "FROM kafka_browse_log "
// + "RIGHT JOIN kafka_user_change_log "
// + "ON kafka_browse_log.userID = kafka_user_change_log.userID ";
//tableEnv.toRetractStream(tableEnv.sqlQuery(execSQL), Row.class).print();
// FULL JOIN
String execSQL = ""
+ "SELECT * "
+ "FROM kafka_browse_log "
+ "FULL JOIN kafka_user_change_log "
+ "ON kafka_browse_log.userID = kafka_user_change_log.userID ";
tableEnv.toRetractStream(tableEnv.sqlQuery(execSQL), Row.class).print();
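// toRetractStream emits Tuple2<Boolean, Row>: true marks an added row, false a retraction.
// A FULL JOIN first emits (row, NULL)/(NULL, row) for unmatched rows and retracts
// them once a matching row arrives on the other side.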
tableEnv.execute(RegularJoin.class.getSimpleName());
}
}
Interval Join
Interval Join: an interval-based join. Example scenario: one stream joins the other stream's data that falls within a relative time range.
Keep the following points in mind when using it:
- Both Processing Time and Event Time are supported (an event-time sketch follows this list).
- Data that falls outside the time range is cleared automatically, which keeps state from growing too large.
- Non-equality conditions such as >=, >, <=, < and BETWEEN ... AND ... are supported.
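A minimal event-time sketch, assuming eventTime were declared as TIMESTAMP(3) instead of STRING in the DDL below (the 5-second watermark delay is an illustrative choice):
+ "  eventTime TIMESTAMP(3), "
+ "  WATERMARK FOR eventTime AS eventTime - INTERVAL '5' SECOND "
The join condition would then bound the event-time attributes instead of proctime, e.g. `user`.eventTime BETWEEN browse.eventTime - INTERVAL '30' SECOND AND browse.eventTime.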
Code example
package com.bigdata.flink.streamJoin.regularJoin;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
/**
* Author: Wang Pei
* Summary:
* Interval Join
*/
public class IntervalJoin {
public static void main(String[] args) throws Exception {
EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().useBlinkPlanner().build();
StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
StreamTableEnvironment tableEnv = StreamTableEnvironment.create(streamEnv, settings);
// Stream 1
// Browse stream, e.g. {"userID":"user_1","eventType":"browse","eventTime":"2015-01-01 00:00:00"}
String kafkaBrowse = ""
+ "CREATE TABLE kafka_browse_log "
+ "( "
+ " userID STRING, "
+ " eventType STRING, "
+ " eventTime STRING, "
+ " proctime as PROCTIME() "
+ ") WITH ( "
+ " 'connector.type' = 'kafka', "
+ " 'connector.version' = '0.10', "
+ " 'connector.properties.bootstrap.servers' = 'kafka01:9092', "
+ " 'connector.properties.zookeeper.connect' = 'kafka01:2181', "
+ " 'connector.topic' = 'topic_1', "
+ " 'connector.properties.group.id' = 'c1', "
+ " 'connector.startup-mode' = 'latest-offset', "
+ " 'format.type' = 'json' "
+ ")";
tableEnv.sqlUpdate(kafkaBrowse);
tableEnv.toAppendStream(tableEnv.from("kafka_browse_log"), Row.class).print();
// Stream 2
// User-profile change stream, e.g. {"userID":"user_1","userName":"name1","userAge":10,"userAddress":"Mars"}
String kafkaUser = ""
+ "CREATE TABLE kafka_user_change_log "
+ "( "
+ " userID STRING, "
+ " userName STRING, "
+ " userAge INT, "
+ " userAddress STRING, "
+ " proctime as PROCTIME() "
+ ") WITH ( "
+ " 'connector.type' = 'kafka', "
+ " 'connector.version' = '0.10', "
+ " 'connector.properties.bootstrap.servers' = 'kafka01:9092', "
+ " 'connector.properties.zookeeper.connect' = 'kafka01:2181', "
+ " 'connector.topic' = 'topic_2', "
+ " 'connector.properties.group.id' = 'c1', "
+ " 'connector.startup-mode' = 'latest-offset', "
+ " 'format.type' = 'json' "
+ ")";
tableEnv.sqlUpdate(kafkaUser);
tableEnv.toAppendStream(tableEnv.from("kafka_user_change_log"), Row.class).print();
// Interval Join
String execSQL = ""
+ "SELECT * "
+ "FROM kafka_browse_log browse, "
+ " kafka_user_change_log `user` "
+ "WHERE "
+ " browse.userID = `user`.userID "
+ " AND `user`.proctime BETWEEN browse.proctime - INTERVAL '30' SECOND AND browse.proctime";
tableEnv.toAppendStream(tableEnv.sqlQuery(execSQL), Row.class).print();
tableEnv.execute(IntervalJoin.class.getSimpleName());
}
}
Window Join
Window Join: joins the records of two streams that share the same key and fall into the same window.
Keep the following points in mind when using it:
- I have not yet found a good SQL formulation of the Window Joins in the Flink DataStream API (Tumbling Window Join/Sliding Window Join/Session Window Join); a DataStream sketch follows this list.
- For Window Join in the Flink DataStream API, see: A. Flink DataStream Window Join: Inner Join, Left Join, Right Join
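A minimal DataStream sketch of a tumbling-window inner join, assuming two illustrative Tuple2<String, String> streams keyed by userID; the in-memory sources, the package/class names and the 10-second window size are assumptions for illustration, not from the examples above:
package com.bigdata.flink.streamJoin.windowJoin;
import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
public class WindowJoin {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Illustrative in-memory sources; in practice these would be the two Kafka
        // streams above: (userID, eventType) and (userID, userName)
        DataStream<Tuple2<String, String>> browse = env.fromElements(Tuple2.of("user_1", "browse"));
        DataStream<Tuple2<String, String>> user = env.fromElements(Tuple2.of("user_1", "name1"));
        // Inner join: same key (userID) and same 10-second tumbling window
        browse.join(user)
                .where(new KeySelector<Tuple2<String, String>, String>() {
                    @Override
                    public String getKey(Tuple2<String, String> value) { return value.f0; }
                })
                .equalTo(new KeySelector<Tuple2<String, String>, String>() {
                    @Override
                    public String getKey(Tuple2<String, String> value) { return value.f0; }
                })
                .window(TumblingProcessingTimeWindows.of(Time.seconds(10)))
                .apply(new JoinFunction<Tuple2<String, String>, Tuple2<String, String>, String>() {
                    @Override
                    public String join(Tuple2<String, String> b, Tuple2<String, String> u) {
                        return b.f0 + "," + b.f1 + "," + u.f1;
                    }
                })
                .print();
        env.execute("WindowJoin");
    }
}
Sliding and session variants only swap the window assigner (SlidingProcessingTimeWindows / ProcessingTimeSessionWindows); for outer-join semantics, use coGroup with a CoGroupFunction instead of join, since it also sees rows that have no match on the other side.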