Flink 中 Table API 和 SQL 完整使用（下）
一、操作案例
1、快速上手
package com.example.chapter11;
import com.example.chapter05.ClickSource;
import com.example.chapter05.Event;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import java.time.Duration;
import static org.apache.flink.table.api.Expressions.$;
public class SimpleTableExample {
    /**
     * Minimal Table API / SQL demo: converts a DataStream of click events into a
     * Table, runs a SQL projection, a Table API filter, and a grouped aggregation
     * that is emitted as a changelog stream.
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Event-time source; Duration.ZERO allows no out-of-orderness.
        SingleOutputStreamOperator<Event> eventStream = env.addSource(new ClickSource())
                .assignTimestampsAndWatermarks(WatermarkStrategy.<Event>forBoundedOutOfOrderness(Duration.ZERO)
                        .withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
                            @Override
                            public long extractTimestamp(Event element, long recordTimestamp) {
                                return element.timestamp;
                            }
                        })
                );
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
        Table eventTable = tableEnv.fromDataStream(eventStream);
        // FIX: `user` is a reserved keyword in Flink SQL (just like `timestamp`,
        // which the original already escaped) and must be backtick-quoted,
        // otherwise the query fails to parse.
        Table resultTable = tableEnv.sqlQuery("select `user`, url, `timestamp` from " + eventTable);
        // Same kind of projection/filter expressed with the Table API.
        Table resultTable2 = eventTable.select($("user"), $("url"))
                .where($("user").isEqual("Alice"));
        tableEnv.toDataStream(resultTable).print("result");
        tableEnv.toDataStream(resultTable2).print("result2");
        // Register a view so subsequent SQL can refer to the table by name.
        tableEnv.createTemporaryView("clickTable", eventTable);
        // FIX: same reserved-keyword escaping for `user` here.
        // A grouped aggregation yields an updating table, hence toChangelogStream.
        Table aggResult = tableEnv.sqlQuery("select `user`,COUNT(url) as cnt from clickTable group by `user`");
        tableEnv.toChangelogStream(aggResult).print("agg");
        env.execute();
    }
}
2、Flink 流处理环境
package com.example.chapter11;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import static org.apache.flink.table.api.Expressions.$;
public class CommonApiTest {
    /**
     * Pure Table API / SQL program (no DataStream API): registers a filesystem
     * source table via DDL, filters it, and writes the result to both a
     * filesystem sink and a print sink.
     */
    public static void main(String[] args) {
        EnvironmentSettings settings = EnvironmentSettings.newInstance()
                .inStreamingMode()
                .useBlinkPlanner()
                .build();
        TableEnvironment tableEnv = TableEnvironment.create(settings);
        // Source table backed by a CSV file.
        String createDDL = "CREATE TABLE clickTable (" +
                "user_name STRING, " +
                "url STRING, " +
                "ts BIGINT " +
                ") WITH (" +
                " 'connector'= 'filesystem'," +
                " 'path' = 'input/clicks.txt'," +
                " 'format' = 'csv'" +
                ")";
        tableEnv.executeSql(createDDL);
        Table clickTable = tableEnv.from("clickTable");
        // Table API: keep only Bob's clicks; result schema is (user_name, url).
        Table resultTable = clickTable.where($("user_name").isEqual("Bob"))
                .select($("user_name"), $("url"));
        tableEnv.createTemporaryView("Result2", resultTable);
        // Same result re-read through SQL to show both APIs interoperate.
        Table resultTable2 = tableEnv.sqlQuery("select user_name,url from Result2");
        // FIX: executeInsert maps columns BY POSITION. The original declared the
        // sinks as (url, user_name) while the inserted tables produce
        // (user_name, url); both columns are STRING, so the data was silently
        // swapped. The sink schemas now match the query column order.
        String createOutDDL = "CREATE TABLE outTable (" +
                "user_name STRING, " +
                "url STRING " +
                ") WITH (" +
                "'connector'= 'filesystem'," +
                "'path' = 'output'," +
                "'format' = 'csv'" +
                ")";
        tableEnv.executeSql(createOutDDL);
        // FIX: renamed misspelled local "creatPrintOutDDL" -> "createPrintOutDDL".
        String createPrintOutDDL = "CREATE TABLE printOutTable (" +
                "user_name STRING, " +
                "url STRING " +
                ") WITH (" +
                "'connector'= 'print'" +
                ")";
        tableEnv.executeSql(createPrintOutDDL);
        // Each executeInsert submits its own job.
        resultTable.executeInsert("outTable");
        resultTable2.executeInsert("printOutTable");
    }
}
3、统计每个用户的PV值
package com.example.chapter11;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
public class CommonApiTestCount {
    /**
     * Per-user PV (page-view) count: a grouped SQL aggregation whose
     * continuously updated result is written to a print sink.
     */
    public static void main(String[] args) {
        // Pure Table environment (no DataStream API), streaming mode, Blink planner.
        EnvironmentSettings environmentSettings = EnvironmentSettings.newInstance()
                .inStreamingMode()
                .useBlinkPlanner()
                .build();
        TableEnvironment tableEnvironment = TableEnvironment.create(environmentSettings);
        // Register the CSV-backed source table.
        String sourceDDL = "CREATE TABLE clickTable ("
                + "user_name STRING, url STRING, ts BIGINT "
                + ") WITH ( 'connector'= 'filesystem', 'path' = 'input/clicks.txt', 'format' = 'csv')";
        tableEnvironment.executeSql(sourceDDL);
        // Count clicks per user; the result updates as new rows arrive.
        Table pvPerUser = tableEnvironment.sqlQuery("select user_name,count(url) as cnt from clickTable group by user_name");
        // Print sink whose schema matches the aggregation result.
        String printSinkDDL = "CREATE TABLE printOutTable ("
                + " user_name STRING, "
                + " cnt BIGINT ) WITH ( 'connector'= 'print')";
        tableEnvironment.executeSql(printSinkDDL);
        pvPerUser.executeInsert("printOutTable");
    }
}
4、窗口TOP N 统计一段时间内的(前两名)活跃用户
package com.example.chapter11;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
public class TopNExample {
    /**
     * Two Top-N demos over the click table: a global (all-time) Top-2 of active
     * users, and a per-tumbling-window Top-2 built with ROW_NUMBER() over a
     * windowed aggregation. Only the windowed variant is printed.
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
        // Source table with event-time column `et` derived from the epoch-millis
        // field `ts`, and a 1-second watermark delay.
        String sourceDDL = "CREATE TABLE clickTable ("
                + "user_name STRING, url STRING, ts BIGINT, "
                + "et AS TO_TIMESTAMP( FROM_UNIXTIME(ts / 1000) ), "
                + "WATERMARK FOR et AS et - INTERVAL '1' SECOND "
                + ") WITH ( 'connector'= 'filesystem', 'path' = 'input/clicks.txt', 'format' = 'csv')";
        tableEnv.executeSql(sourceDDL);
        // All-time Top-2 users by click count. NOTE: built but never executed or
        // printed (kept to mirror the original example's illustration).
        Table globalTopN = tableEnv.sqlQuery(
                "select user_name, cnt, row_num FROM ("
                        + " SELECT *,ROW_NUMBER() OVER (order by cnt DESC ) AS row_num "
                        + " FROM (SELECT user_name,COUNT(url) AS cnt FROM clickTable GROUP BY user_name)"
                        + ") WHERE row_num <=2");
        // Inner aggregation: clicks per user per 10-second tumbling window.
        String windowAggSql = "SELECT user_name,COUNT(url) AS cnt, window_start,window_end "
                + "FROM Table (TUMBLE(TABLE clickTable,DESCRIPTOR(et),INTERVAL '10' SECOND))"
                + "GROUP BY user_name,window_start,window_end";
        // Top-2 per window: rank rows within each (window_start, window_end) pair.
        Table windowTopN = tableEnv.sqlQuery(
                "select user_name, cnt, row_num,window_end FROM ("
                        + " SELECT *,ROW_NUMBER() OVER ("
                        + " PARTITION BY window_start,window_end "
                        + " order by cnt DESC ) AS row_num "
                        + " FROM ( " + windowAggSql + " )) WHERE row_num <=2");
        // Ranking results can retract, hence a changelog stream.
        tableEnv.toChangelogStream(windowTopN).print("window TOP N: ");
        env.execute();
    }
}
5、聚合操作函数
package com.example.chapter11;
import com.example.chapter05.ClickSource;
import com.example.chapter05.Event;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import java.time.Duration;
import static org.apache.flink.table.api.Expressions.$;
public class TimeAndWindowsTest {
// Demonstrates event-time attributes and window aggregations in Flink SQL:
// a plain group aggregation, the legacy group-window syntax (TUMBLE in GROUP BY),
// the window TVFs TUMBLE / HOP / CUMULATE, and an OVER window.
// NOTE(review): only overWindowResultTable is actually printed; the other result
// tables (and the stream-derived local `clickTable`) are built but never
// executed — they are illustrative dead code.
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
// Source table registered via DDL: event-time column `et` is computed from the
// epoch-millisecond field `ts`, with a 1-second watermark delay.
String createDDL = "CREATE TABLE clickTable (" +
"user_name STRING, " +
"url STRING, " +
"ts BIGINT, " +
"et AS TO_TIMESTAMP( FROM_UNIXTIME(ts / 1000) ), " +
"WATERMARK FOR et AS et - INTERVAL '1' SECOND " +
") WITH (" +
" 'connector'= 'filesystem'," +
" 'path' = 'input/clicks.txt'," +
" 'format' = 'csv'" +
")";
tableEnv.executeSql(createDDL);
// Alternative route: build a watermarked DataStream and convert it to a Table.
SingleOutputStreamOperator<Event> clickStream = env.addSource(new ClickSource())
.assignTimestampsAndWatermarks(WatermarkStrategy.<Event>forBoundedOutOfOrderness(Duration.ZERO)
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event element, long recordTimestamp) {
return element.timestamp;
}
})
);
// `et` is declared as the rowtime attribute of the converted table.
// NOTE(review): this local variable shares its name with the registered DDL
// table "clickTable"; the SQL below resolves the registered catalog table
// (which has user_name/ts), not this unregistered Table object.
Table clickTable = tableEnv.fromDataStream(clickStream, $("user"), $("url"), $("timestamp"),
$("et").rowtime());
// Plain (unwindowed) grouped aggregation — produces an updating table.
Table aggTable = tableEnv.sqlQuery("select user_name,count(1) from clickTable group by user_name");
// Legacy "group window" syntax: TUMBLE(...) placed directly in GROUP BY.
Table groupWindowResultTable = tableEnv.sqlQuery("select " +
"user_name,count(1) as cnt, " +
"TUMBLE_END(et,INTERVAL '10' SECOND) AS entT " +
"FROM clickTable " +
"group by " +
"user_name, " +
"TUMBLE(et,INTERVAL '10' SECOND )"
);
// Window TVF: 10-second tumbling windows.
Table tumbleWindowResultTable = tableEnv.sqlQuery(
"select user_name,COUNT(1) as cnt, " +
" window_end as endT " +
"from TABLE( " +
"TUMBLE(TABLE clickTable,DESCRIPTOR(et),INTERVAL '10' SECOND )" +
")" +
"GROUP BY user_name,window_end,window_start");
// Window TVF: 10-second windows sliding every 5 seconds.
Table hopWindowResultTable = tableEnv.sqlQuery(
"select user_name,COUNT(1) as cnt, " +
" window_end as endT " +
"from TABLE( " +
"HOP(TABLE clickTable,DESCRIPTOR(et),INTERVAL '5' SECOND ,INTERVAL '10' SECOND )" +
")" +
"GROUP BY user_name,window_end,window_start");
// Window TVF: cumulative windows growing in 5-second steps up to 10 seconds.
Table cumulateWindowResultTable = tableEnv.sqlQuery(
"select user_name,COUNT(1) as cnt, " +
" window_end as endT " +
"from TABLE( " +
"CUMULATE(TABLE clickTable,DESCRIPTOR(et),INTERVAL '5' SECOND ,INTERVAL '10' SECOND )" +
")" +
"GROUP BY user_name,window_end,window_start");
// OVER window: per-row average of ts over the 3 preceding rows plus the current
// row, partitioned by user and ordered by event time. OVER results are
// append-only, so the toDataStream conversion below is valid.
Table overWindowResultTable = tableEnv.sqlQuery("select user_name," +
" avg(ts) OVER(" +
" PARTITION BY user_name " +
"ORDER BY et " +
"ROWS BETWEEN 3 PRECEDING AND CURRENT ROW" +
") AS avg_ts " +
"from clickTable"
);
tableEnv.toDataStream(overWindowResultTable).print("over window: ");
env.execute();
}
}
6、UdfTest_AggregateFunction
package com.example.chapter11;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.functions.AggregateFunction;
public class UdfTest_AggregateFunction {
    /**
     * Custom aggregate function (UDAF) demo: a weighted average over `ts`,
     * invoked from SQL with a constant weight of 1.
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
        // CSV-backed source with an event-time column and watermark.
        String createDDL = "CREATE TABLE clickTable (" +
                "user_name STRING, " +
                "url STRING, " +
                "ts BIGINT, " +
                "et AS TO_TIMESTAMP( FROM_UNIXTIME(ts / 1000) ), " +
                "WATERMARK FOR et AS et - INTERVAL '1' SECOND " +
                ") WITH (" +
                " 'connector'= 'filesystem'," +
                " 'path' = 'input/clicks.txt'," +
                " 'format' = 'csv'" +
                ")";
        tableEnv.executeSql(createDDL);
        // Register the UDAF under the name used in the SQL below.
        tableEnv.createTemporarySystemFunction("WeighedAverage", WeighedAverage.class);
        Table resultTable = tableEnv.sqlQuery("select user_name, WeighedAverage(ts, 1) as w_avg " +
                " from clickTable group by user_name");
        // FIX: a grouped aggregation produces an updating table; toDataStream()
        // only supports append-only tables and throws at runtime, so the result
        // must be consumed as a changelog stream (as the earlier examples do).
        tableEnv.toChangelogStream(resultTable).print();
        env.execute();
    }
    /** Mutable accumulator: running weighted sum and total weight. */
    public static class WeightedAvgAccumulator {
        public long sum = 0;
        public int count = 0;
    }
    /**
     * UDAF computing a weighted average. Uses integer (truncating) division and
     * returns null for an empty / zero-weight group.
     */
    public static class WeighedAverage extends AggregateFunction<Long, WeightedAvgAccumulator> {
        @Override
        public Long getValue(WeightedAvgAccumulator accumulator) {
            if (accumulator.count == 0)
                return null;
            else
                return accumulator.sum / accumulator.count;
        }
        @Override
        public WeightedAvgAccumulator createAccumulator() {
            return new WeightedAvgAccumulator();
        }
        /** Folds one (value, weight) pair into the accumulator. */
        public void accumulate(WeightedAvgAccumulator accumulator, Long iValue, Integer iWeight) {
            accumulator.sum += iValue * iWeight;
            accumulator.count += iWeight;
        }
    }
}
7、调用UDF进行查询转换
package com.example.chapter11;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.functions.ScalarFunction;
public class UdfTest_ScalarFunction {
    /**
     * Scalar UDF demo: registers a function that hashes a string and applies it
     * to every row in SQL.
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
        // CSV-backed source with an event-time column and watermark.
        String sourceDDL = "CREATE TABLE clickTable ("
                + "user_name STRING, url STRING, ts BIGINT, "
                + "et AS TO_TIMESTAMP( FROM_UNIXTIME(ts / 1000) ), "
                + "WATERMARK FOR et AS et - INTERVAL '1' SECOND "
                + ") WITH ( 'connector'= 'filesystem', 'path' = 'input/clicks.txt', 'format' = 'csv')";
        tableEnv.executeSql(sourceDDL);
        // Register the scalar function under the name used in SQL below.
        tableEnv.createTemporaryFunction("MyHash", MyHashFunction.class);
        // Per-row projection — append-only, so toDataStream is safe here.
        Table hashed = tableEnv.sqlQuery("select user_name, MyHash(user_name) from clickTable");
        tableEnv.toDataStream(hashed).print();
        env.execute();
    }
    /** Scalar UDF returning String.hashCode() of its argument. */
    public static class MyHashFunction extends ScalarFunction {
        public int eval(String str) {
            return str.hashCode();
        }
    }
}
8、表聚合函数
package com.example.chapter11;
import com.example.chapter05.ClickSource;
import com.example.chapter05.Event;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.functions.TableAggregateFunction;
import org.apache.flink.util.Collector;
import java.time.Duration;
import static org.apache.flink.table.api.Expressions.$;
import static org.apache.flink.table.api.Expressions.call;
public class UdfTest_TableAggregateFunction {
    /**
     * Table aggregate function (UDTAF) demo: per 10-second tumbling window,
     * emits the top-2 click counts via a custom Top2 function used through
     * flatAggregate.
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
        // Event-time source; Duration.ZERO allows no out-of-orderness.
        SingleOutputStreamOperator<Event> eventStream = env.addSource(new ClickSource())
                .assignTimestampsAndWatermarks(WatermarkStrategy.<Event>forBoundedOutOfOrderness(Duration.ZERO)
                        .withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
                            @Override
                            public long extractTimestamp(Event element, long recordTimestamp) {
                                return element.timestamp;
                            }
                        })
                );
        // Expose the stream as a table; `rt` is the rowtime (event-time) attribute.
        Table eventTable = tableEnv.fromDataStream(eventStream,
                $("user"),
                $("url"),
                $("timestamp").as("ts"),
                $("rt").rowtime());
        tableEnv.createTemporaryView("EventTable", eventTable);
        // FIX: `user` is a reserved keyword in Flink SQL and must be
        // backtick-escaped in both the select list and the GROUP BY.
        Table windowAggTable = tableEnv.sqlQuery("select `user`, count(url) as cnt, " +
                "window_end " +
                "from TABLE(" +
                " TUMBLE( TABLE EventTable, DESCRIPTOR(rt), INTERVAL '10' SECOND )" +
                ")" +
                "group by `user`," +
                " window_start," +
                " window_end");
        tableEnv.createTemporaryView("AggTable", windowAggTable);
        tableEnv.createTemporarySystemFunction("Top2", Top2.class);
        // flatAggregate: the UDTAF may emit multiple rows (rank 1 and 2) per group.
        Table resultTable = tableEnv.from("AggTable")
                .groupBy($("window_end"))
                .flatAggregate(call("Top2", $("cnt")).as("value", "rank"))
                .select($("window_end"), $("value"), $("rank"));
        // Top-2 results retract/update as counts grow -> changelog stream.
        tableEnv.toChangelogStream(resultTable).print();
        env.execute();
    }
    /** Accumulator holding the two largest values seen so far. */
    public static class Top2Accumulator {
        public Long first;
        public Long second;
    }
    /**
     * Emits (value, rank) pairs for the top-2 Long values of a group;
     * rank is 1 for the largest, 2 for the second largest.
     */
    public static class Top2 extends TableAggregateFunction<Tuple2<Long, Integer>, Top2Accumulator> {
        @Override
        public Top2Accumulator createAccumulator() {
            Top2Accumulator acc = new Top2Accumulator();
            // Sentinel: Long.MIN_VALUE marks "no value seen yet".
            acc.first = Long.MIN_VALUE;
            acc.second = Long.MIN_VALUE;
            return acc;
        }
        /** Folds one value into the accumulator, keeping the two largest. */
        public void accumulate(Top2Accumulator acc, Long value) {
            if (value > acc.first) {
                acc.second = acc.first;
                acc.first = value;
            } else if (value > acc.second) {
                acc.second = value;
            }
        }
        /** Emits up to two rows: (largest, 1) and (second largest, 2). */
        public void emitValue(Top2Accumulator acc, Collector<Tuple2<Long, Integer>> out) {
            if (acc.first != Long.MIN_VALUE) {
                out.collect(Tuple2.of(acc.first, 1));
            }
            if (acc.second != Long.MIN_VALUE) {
                out.collect(Tuple2.of(acc.second, 2));
            }
        }
    }
}
9、实现自定义的函数
package com.example.chapter11;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.functions.TableFunction;
public class UdfTest_TableFunction {
    /**
     * Table function (UDTF) demo: splits each url on '?' and joins the exploded
     * (word, length) rows back to the source rows via LATERAL TABLE.
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
        // CSV-backed source with an event-time column and watermark.
        String sourceDDL = "CREATE TABLE clickTable ("
                + "user_name STRING, url STRING, ts BIGINT, "
                + "et AS TO_TIMESTAMP( FROM_UNIXTIME(ts / 1000) ), "
                + "WATERMARK FOR et AS et - INTERVAL '1' SECOND "
                + ") WITH ( 'connector'= 'filesystem', 'path' = 'input/clicks.txt', 'format' = 'csv')";
        tableEnv.executeSql(sourceDDL);
        tableEnv.createTemporaryFunction("MySplint", MySplint.class);
        // Cross join each source row with the rows produced by MySplint(url).
        Table exploded = tableEnv.sqlQuery("select user_name, url, word, length"
                + " from clickTable, "
                + "LATERAL TABLE( MySplint(url) ) AS T(word,length)");
        tableEnv.toDataStream(exploded).print();
        env.execute();
    }
    /** UDTF: emits one (fragment, fragment length) row per '?'-separated piece. */
    public static class MySplint extends TableFunction<Tuple2<String, Integer>> {
        public void eval(String str) {
            // '?' is a regex metacharacter, hence the escape in split().
            for (String piece : str.split("\\?")) {
                collect(Tuple2.of(piece, piece.length()));
            }
        }
    }
}