目录
5. 表聚合函数(Table Aggregate Functions)
一、简单示例与程序模板
1、一个简单示例
// End-to-end example: bounded stream -> table -> query (SQL and Table API) -> stream -> print.
public static void main(String[] args) throws Exception {
// Get the stream execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 1. Read the source data (a bounded collection of click events)
SingleOutputStreamOperator<Event> eventStream = env
.fromElements(
new Event("Alice", "./home", 1000L),
new Event("Bob", "./cart", 1000L),
new Event("Alice", "./prod?id=1", 5 * 1000L),
new Event("Cary", "./home", 60 * 1000L),
new Event("Bob", "./prod?id=3", 90 * 1000L),
new Event("Alice", "./prod?id=7", 105 * 1000L)
);
// 2. Create the table environment
StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
// 3. Convert the DataStream into a Table
Table eventTable = tableEnv.fromDataStream(eventStream);
// 4. Extract data by executing SQL (the Table object is inlined into the query string)
Table resultTable1 = tableEnv.sqlQuery("select url, user from " + eventTable);
// 5. Equivalent transformation directly via the Table API
Table resultTable2 = eventTable.select($("user"), $("url"))
.where($("user").isEqual("Alice"));
// 6. Convert the result tables back to data streams and print them
tableEnv.toDataStream(resultTable1).print("result1");
tableEnv.toDataStream(resultTable2).print("result2");
// Trigger job execution
env.execute();
}
2、程序架构
// Skeleton of a typical Table API / SQL program. The "..." parts are placeholders,
// so this template is illustrative only and does not compile as-is.
public static void main(String[] args) {
// Create the table environment
TableEnvironment tableEnv = ...;
// Create an input table connected to an external system for reading data
tableEnv.executeSql("CREATE TEMPORARY TABLE inputTable ... WITH ( 'connector'
= ... )");
// Register a table connected to an external system, used for output
tableEnv.executeSql("CREATE TEMPORARY TABLE outputTable ... WITH ( 'connector'
= ... )");
// Query/transform with SQL, producing a new table
Table table1 = tableEnv.sqlQuery("SELECT ... FROM inputTable... ");
// Query/transform with the Table API, producing a new table
Table table2 = tableEnv.from("inputTable").select(...);
// Write the result into the output table
TableResult tableResult = table1.executeInsert("outputTable");
}
二、聚合查询
1、分组聚合
// 1. Group aggregation: per-user click count over the whole stream
Table aggTable = tableEnv.sqlQuery("SELECT user_name, COUNT(1) FROM clickTable GROUP BY user_name");
// 2. Group-window aggregation (legacy TUMBLE group-window syntax)
Table groupWindowResultTable = tableEnv.sqlQuery("SELECT " +
"user_name, " +
"COUNT(1) AS cnt, " +
"TUMBLE_END(et, INTERVAL '10' SECOND) as endT " +
"FROM clickTable " +
"GROUP BY " + // group by both the window and the user name
" user_name, " +
" TUMBLE(et, INTERVAL '10' SECOND)" // a 10-second tumbling window (the original comment wrongly said 1 hour)
);
2、窗口聚合
在 Flink 的 Table API 和 SQL 中,窗口的计算是通过“窗口聚合”( window aggregation )来实现的。与分组聚合类似,窗口聚合也需要调用 SUM() 、 MAX() 、 MIN() 、 COUNT() 一类的聚合函数,通过 GROUP BY 子句来指定分组的字段。只不过窗口聚合时,需要将窗口信息作为分组 key 的一部分定义出来。Flink 1.13 版本开始使用了“窗口表值函数”(Windowing TVF),窗口本身返回的就是一个表,所以窗口会出现在 FROM 后面,GROUP BY 后面的则是窗口新增的字段 window_start 和 window_end。
// 3. Window aggregation with windowing TVFs (Flink 1.13+): the window function
// appears in FROM, and window_start / window_end are ordinary group-by columns.
// 3.1 Tumbling window: fixed 10-second windows
Table tumbleWindowResultTable = tableEnv.sqlQuery("SELECT user_name, COUNT(url) AS cnt, " +
" window_end AS endT " +
"FROM TABLE( " +
" TUMBLE( TABLE clickTable, DESCRIPTOR(et), INTERVAL '10' SECOND)" +
") " +
"GROUP BY user_name, window_start, window_end "
);
// 3.2 Sliding (hop) window: 10-second windows sliding every 5 seconds
Table hopWindowResultTable = tableEnv.sqlQuery("SELECT user_name, COUNT(url) AS cnt, " +
" window_end AS endT " +
"FROM TABLE( " +
" HOP( TABLE clickTable, DESCRIPTOR(et), INTERVAL '5' SECOND, INTERVAL '10' SECOND)" +
") " +
"GROUP BY user_name, window_start, window_end "
);
// 3.3 Cumulate window: grows in 5-second steps up to a 10-second maximum
Table cumulateWindowResultTable = tableEnv.sqlQuery("SELECT user_name, COUNT(url) AS cnt, " +
" window_end AS endT " +
"FROM TABLE( " +
" CUMULATE( TABLE clickTable, DESCRIPTOR(et), INTERVAL '5' SECOND, INTERVAL '10' SECOND)" +
") " +
"GROUP BY user_name, window_start, window_end "
);
3、开窗(Over)聚合
// 4. Over aggregation: per user, the running average of ts over the 3 preceding rows
Table overWindowResultTable = tableEnv.sqlQuery("SELECT user_name, " +
" avg(ts) OVER (" +
" PARTITION BY user_name " +
" ORDER BY et " + // et must be a time attribute of clickTable
" ROWS BETWEEN 3 PRECEDING AND CURRENT ROW" +
") AS avg_ts " +
"FROM clickTable");
-- Over aggregation with a RANGE window: for each row, count the same user's url
-- visits within the preceding 1 hour of event time (ts is the time attribute).
SELECT user, ts,
COUNT(url) OVER (
PARTITION BY user
ORDER BY ts
RANGE BETWEEN INTERVAL '1' HOUR PRECEDING AND CURRENT ROW
) AS cnt
FROM EventTable
这里我们以 ts 作为时间属性字段,对 EventTable 中的每行数据都选取它之前 1 小时的所有数据进行聚合,统计每个用户访问 url 的总次数,并重命名为 cnt。最终将表中每行的 user、ts 以及扩展出的 cnt 字段提取出来。
三、TopN
1、普通TopN
// Plain Top-N: pick the 2 users with the highest overall click counts.
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
// 1. Define the time attribute directly in the CREATE TABLE DDL
String createDDL = "CREATE TABLE clickTable (" +
" `user` STRING, " +
" url STRING, " +
" ts BIGINT, " +
" et AS TO_TIMESTAMP( FROM_UNIXTIME(ts / 1000) ), " + // event-time column derived from the epoch-millis ts
" WATERMARK FOR et AS et - INTERVAL '1' SECOND " +
") WITH (" +
" 'connector' = 'filesystem', " +
" 'path' = 'input/clicks.csv', " +
" 'format' = 'csv' " +
")";
tableEnv.executeSql(createDDL);
// Plain Top-N: ROW_NUMBER() over the per-user counts, then keep row_num <= 2.
// NOTE(review): the DDL escapes the reserved word `user` with backticks but this
// query does not — confirm it parses on the targeted Flink version.
Table topNResultTable = tableEnv.sqlQuery("SELECT user, cnt, row_num " +
"FROM (" +
" SELECT *, ROW_NUMBER() OVER (" +
" ORDER BY cnt DESC" +
" ) AS row_num " +
" FROM (SELECT user, COUNT(url) AS cnt FROM clickTable GROUP BY user)" +
") WHERE row_num <= 2");
// Top-N results are continuously updated, so print them as a changelog stream
tableEnv.toChangelogStream(topNResultTable).print("top 2: ");
env.execute();
}
2、窗口TopN
// Window Top-N: within each 1-hour tumbling window, pick the 2 most active users.
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// Read the source data and assign timestamps / generate watermarks
SingleOutputStreamOperator<Event> eventStream = env
.fromElements(
new Event("Alice", "./home", 1000L),
new Event("Bob", "./cart", 1000L),
new Event("Alice", "./prod?id=1", 25 * 60 * 1000L),
new Event("Alice", "./prod?id=4", 55 * 60 * 1000L),
new Event("Bob", "./prod?id=5", 3600 * 1000L + 60 * 1000L),
new Event("Cary", "./home", 3600 * 1000L + 30 * 60 * 1000L),
new Event("Cary", "./prod?id=7", 3600 * 1000L + 59 * 60 * 1000L)
)
.assignTimestampsAndWatermarks(
WatermarkStrategy.<Event>forMonotonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event element, long recordTimestamp) {
return element.timestamp;
}
})
);
// Create the table environment
StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
// Convert the stream into a table, declaring the time attribute
Table eventTable = tableEnv.fromDataStream(
eventStream,
$("user"),
$("url"),
$("timestamp").rowtime().as("ts")
// declare "timestamp" as event time and rename it to ts
);
// Register the table as EventTable so SQL can reference it by name
tableEnv.createTemporaryView("EventTable", eventTable);
// Subquery: window aggregation producing (window info, user, click count)
String subQuery =
"SELECT window_start, window_end, user, COUNT(url) as cnt " +
"FROM TABLE ( " +
"TUMBLE( TABLE EventTable, DESCRIPTOR(ts), INTERVAL '1' HOUR )) " +
"GROUP BY window_start, window_end, user ";
// Outer Top-N query: rank users within each window and keep the top 2
String topNQuery =
"SELECT * " +
"FROM (" +
"SELECT *, " +
"ROW_NUMBER() OVER ( " +
"PARTITION BY window_start, window_end " +
"ORDER BY cnt desc " +
") AS row_num " +
"FROM (" + subQuery + ")) " +
"WHERE row_num <= 2";
// Run the SQL and get the result table
Table result = tableEnv.sqlQuery(topNQuery);
// Window Top-N emits insert-only results, so toDataStream is sufficient here
tableEnv.toDataStream(result).print();
env.execute();
}
四、自定义函数(UDF)
1. 整体调用流程
(1)注册函数
注册函数时需要调用表环境的 createTemporarySystemFunction() 方法,传入注册的函数名以及 UDF 类的 Class 对象:
// Register the UDF class under the name "MyFunction"
tableEnv.createTemporarySystemFunction("MyFunction", MyFunction.class);
(2)使用 Table API 调用函数
在 Table API 中,需要使用 call() 方法来调用自定义函数:
// In the Table API, invoke the registered function via call()
tableEnv.from("MyTable").select(call("MyFunction", $("myField")));
(3)在 SQL 中调用函数
// In SQL, the registered function is used like a built-in function
tableEnv.sqlQuery("SELECT MyFunction(myField) FROM MyTable");
2. 标量函数(Scalar Functions)
// Demo of a custom scalar function (MyHash) invoked from SQL.
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
// 1. Custom source, converted from a DataStream
SingleOutputStreamOperator<Event> eventStream = env.addSource(new ClickSource())
.assignTimestampsAndWatermarks(WatermarkStrategy.<Event>forBoundedOutOfOrderness(Duration.ZERO)
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event element, long recordTimestamp) {
return element.timestamp;
}
})
);
// 2. Convert the stream into a table
Table eventTable = tableEnv.fromDataStream(eventStream,
$("user"),
$("url"),
$("timestamp").rowtime().as("ts"));
tableEnv.createTemporaryView("EventTable", eventTable);
// 3. Register the custom scalar function
tableEnv.createTemporarySystemFunction("MyHash", MyHash.class);
// 4. Call the UDF in a SQL query
Table resultTable = tableEnv.sqlQuery("select user, MyHash(user) from EventTable");
// 5. Sink: print-connector table (columns map by position to the query result)
tableEnv.executeSql("create table output (" +
"uname STRING, " +
"myhash INT ) " +
"WITH (" +
"'connector' = 'print')");
resultTable.executeInsert("output");
}
// Custom ScalarFunction: maps one string to one int (its Java hashCode).
public static class MyHash extends ScalarFunction {
// Flink discovers this method by reflection; the name "eval" is required.
public int eval(String str){
return str.hashCode();
}
}
3. 表函数(Table Functions)
// Demo of a custom table function (MySplit) joined to each row via LATERAL TABLE.
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
// 1. Custom source, converted from a DataStream
SingleOutputStreamOperator<Event> eventStream = env.addSource(new ClickSource())
.assignTimestampsAndWatermarks(WatermarkStrategy.<Event>forBoundedOutOfOrderness(Duration.ZERO)
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event element, long recordTimestamp) {
return element.timestamp;
}
})
);
// 2. Convert the stream into a table
Table eventTable = tableEnv.fromDataStream(eventStream,
$("user"),
$("url"),
$("timestamp").rowtime().as("ts"));
tableEnv.createTemporaryView("EventTable", eventTable);
// 3. Register the custom table function
tableEnv.createTemporarySystemFunction("MySplit", MySplit.class);
// 4. Call the UDF: each input row is joined with the rows MySplit emits,
// and the output columns are renamed to (word, length)
Table resultTable = tableEnv.sqlQuery("select user, url, word, length " +
"from EventTable, LATERAL TABLE( MySplit(url) ) AS T(word, length)");
// 5. Sink: print-connector table (columns map by position to the query result)
tableEnv.executeSql("create table output (" +
"uname STRING, " +
"url STRING, " +
"word STRING, " +
"length INT) " +
"WITH (" +
"'connector' = 'print')");
resultTable.executeInsert("output");
}
// Custom TableFunction: splits a URL on '?' and emits one (part, length) row per
// fragment. The Tuple2 type parameter declares the two output columns.
public static class MySplit extends TableFunction<Tuple2<String, Integer>>{
// Flink discovers this method by reflection; collect() emits one output row.
public void eval(String str){
// "\\?" escapes the regex metacharacter '?' (and the backslash itself in Java)
for (String part : str.split("\\?")){
collect(Tuple2.of(part, part.length()));
}
}
}
4. 聚合函数(Aggregate Functions)
// Demo of a custom aggregate function (WeightedAverage) used in a GROUP BY query.
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
// 1. Custom source, converted from a DataStream
SingleOutputStreamOperator<Event> eventStream = env.addSource(new ClickSource())
.assignTimestampsAndWatermarks(WatermarkStrategy.<Event>forBoundedOutOfOrderness(Duration.ZERO)
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event element, long recordTimestamp) {
return element.timestamp;
}
})
);
// 2. Convert the stream into a table (ts keeps the raw BIGINT, rt is event time)
Table eventTable = tableEnv.fromDataStream(eventStream,
$("user"),
$("url"),
$("timestamp").as("ts"),
$("rt").rowtime());
tableEnv.createTemporaryView("EventTable", eventTable);
// 3. Register the custom aggregate function
tableEnv.createTemporarySystemFunction("WeightedAverage", WeightedAverage.class);
// 4. Call the UDF; every row gets a constant weight of 1 here
Table resultTable = tableEnv.sqlQuery("select user, " +
" WeightedAverage(ts, 1) as weighted_avg " +
"from EventTable " +
"group by user");
// 5. Sink: print-connector table (columns map by position to the query result)
tableEnv.executeSql("create table output (" +
"uname STRING, " +
"weighted_avg BIGINT) " +
"WITH (" +
"'connector' = 'print')");
resultTable.executeInsert("output");
}
// Accumulator (mutable intermediate state) for the weighted average.
// Kept as a public POJO with public fields so Flink can (de)serialize it.
public static class WeightedAvgAccumulator {
public long sum = 0; // weighted sum of all accumulated values
public int count = 0; // total weight accumulated so far
}
// Custom AggregateFunction computing a weighted average:
// result type Long, accumulator type WeightedAvgAccumulator.
public static class WeightedAverage extends AggregateFunction<Long, WeightedAvgAccumulator>{
// Returns the final result from the accumulator; null when nothing was accumulated.
@Override
public Long getValue(WeightedAvgAccumulator accumulator) {
if (accumulator.count == 0)
return null; // guard against division by zero
else
return accumulator.sum / accumulator.count; // integer division of the weighted sum
}
@Override
public WeightedAvgAccumulator createAccumulator() {
return new WeightedAvgAccumulator();
}
// Called once per input row, similar to an "add" step.
// Flink invokes this by reflection; the name "accumulate" is required.
public void accumulate(WeightedAvgAccumulator accumulator, Long iValue, Integer iWeight){
// SQL columns are nullable: skip null rows instead of throwing an NPE on unboxing
if (iValue == null || iWeight == null) {
return;
}
accumulator.sum += iValue * iWeight; // the value counts iWeight times
accumulator.count += iWeight;
}
}
5. 表聚合函数(Table Aggregate Functions)
// Demo of a custom table aggregate function (Top2) called via flatAggregate.
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
// 1. Custom source, converted from a DataStream
SingleOutputStreamOperator<Event> eventStream = env.addSource(new ClickSource())
.assignTimestampsAndWatermarks(WatermarkStrategy.<Event>forBoundedOutOfOrderness(Duration.ZERO)
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event element, long recordTimestamp) {
return element.timestamp;
}
})
);
// 2. Convert the stream into a table (ts keeps the raw BIGINT, rt is event time)
Table eventTable = tableEnv.fromDataStream(eventStream,
$("user"),
$("url"),
$("timestamp").as("ts"),
$("rt").rowtime());
tableEnv.createTemporaryView("EventTable", eventTable);
// 3. Tumbling-window aggregation: each user's click count per 10-second window
Table windowAggTable = tableEnv.sqlQuery("select user, count(url) as cnt, " +
"window_end " +
"from TABLE(" +
" TUMBLE( TABLE EventTable, DESCRIPTOR(rt), INTERVAL '10' SECOND )" +
")" +
"group by user," +
" window_start," +
" window_end");
tableEnv.createTemporaryView("AggTable", windowAggTable);
// 4. Register the table aggregate function
tableEnv.createTemporarySystemFunction("Top2", Top2.class);
// 5. Call it with the Table API (flatAggregate has no SQL equivalent)
Table resultTable = tableEnv.from("AggTable")
.groupBy($("window_end"))
.flatAggregate(call("Top2", $("cnt")).as("value", "rank"))
.select($("window_end"), $("value"), $("rank"));
// 6. Results are updated as rows arrive, so print them as a changelog stream
tableEnv.toChangelogStream(resultTable).print();
env.execute();
}
// Accumulator for Top2: holds the largest and second-largest values seen so far.
// Public POJO with public fields so Flink can (de)serialize it.
public static class Top2Accumulator {
public Long first; // largest value so far (Long.MIN_VALUE sentinel until set)
public Long second; // second-largest value so far (Long.MIN_VALUE sentinel until set)
}
// Custom TableAggregateFunction: emits the two largest values of a group as
// (value, rank) pairs — a multi-row ("table") aggregate result.
public static class Top2 extends TableAggregateFunction<Tuple2<Long, Integer>, Top2Accumulator> {
@Override
public Top2Accumulator createAccumulator() {
Top2Accumulator acc = new Top2Accumulator();
// Long.MIN_VALUE sentinel: any real value compares larger, and emitValue
// uses it to detect "not yet set"
acc.first = Long.MIN_VALUE;
acc.second = Long.MIN_VALUE;
return acc;
}
// Called once per input row; keeps the two largest values seen so far.
// Flink invokes this by reflection; the name "accumulate" is required.
public void accumulate(Top2Accumulator acc, Long value) {
// SQL columns are nullable: skip null rows instead of throwing an NPE on unboxing
if (value == null) {
return;
}
if (value > acc.first) {
acc.second = acc.first;
acc.first = value;
} else if (value > acc.second) {
acc.second = value;
}
}
// Emits up to two (value, rank) rows for the group.
public void emitValue(Top2Accumulator acc, Collector<Tuple2<Long, Integer>> out) {
if (acc.first != Long.MIN_VALUE) {
out.collect(Tuple2.of(acc.first, 1));
}
if (acc.second != Long.MIN_VALUE) {
out.collect(Tuple2.of(acc.second, 2));
}
}
}