For the base environment used in this article, see the earlier post "flink 1.10.1 Java wordcount demo (nc + socket)".
A group window performs its computation and emits a result only once, when the window closes (whether it is a time window or a count window).
Using event time as the reference, this article uses a tumbling group window to aggregate the data inside each window, producing statistics such as the record count and the average value.
1. Add the dependency
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-table-planner_2.11</artifactId>
    <version>1.10.1</version>
</dependency>
Depending on the Flink version, the package you need to add here may differ.
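For example, Flink 1.10 also ships the Blink planner as a separate artifact. If you wanted to try it instead of the legacy planner used in this article, the dependency would look like the following (the table environment would then also need to be created with EnvironmentSettings that select the Blink planner):
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-table-planner-blink_2.11</artifactId>
    <version>1.10.1</version>
</dependency>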
2. Program code
package com.demo.sql;

import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.Tumble;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.types.Row;

public class FlinkSqlGroupWindow {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        env.setParallelism(1);

        // 1. Read the input data
        DataStream<String> inputStream = env.readTextFile("data/sensor.txt");

        // 2. Map each line to a POJO and assign event-time timestamps and watermarks,
        //    tolerating up to 2 seconds of out-of-order data
        DataStream<SensorData> dataStream = inputStream.map(line -> {
            String[] fields = line.split(",");
            return new SensorData(fields[0], Long.parseLong(fields[1]), Double.parseDouble(fields[2]));
        }).assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<SensorData>(Time.seconds(2)) {
            @Override
            public long extractTimestamp(SensorData element) {
                return element.getDt() * 1000L; // dt is in seconds; Flink expects milliseconds
            }
        });

        // 3. Create the table environment
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);

        // 4. Create a table from the stream; rt.rowtime exposes the event time as a rowtime attribute
        Table dataTable = tableEnv.fromDataStream(dataStream, "id, dt, temperature, rt.rowtime");
        // dataTable.printSchema();
        tableEnv.createTemporaryView("sensor", dataTable);
        tableEnv.toAppendStream(dataTable, Row.class).print();

        // 5. Window operations
        // 5.1 Group window
        // Table API: a 10-second tumbling window on the rowtime attribute rt
        Table resultTable = dataTable.window(Tumble.over("10.seconds").on("rt").as("tw"))
                .groupBy("id, tw")
                .select("id, id.count, temperature.avg, tw.end");

        // SQL: the equivalent query using TUMBLE / TUMBLE_END
        Table resultSqlTable = tableEnv.sqlQuery("select id, count(id) as cnt, avg(temperature) as avgTemp, tumble_end(rt, interval '10' second) " +
                "from sensor group by id, tumble(rt, interval '10' second)");

        dataTable.printSchema();
        tableEnv.toAppendStream(resultTable, Row.class).print("result");
        tableEnv.toRetractStream(resultSqlTable, Row.class).print("sql");

        env.execute();
    }
}
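Tumble is only one of the group-window types; Slide and Session follow the same pattern in the Table API. As a minimal sketch (not wired into the program above), a 10-second window that slides every 5 seconds over the same dataTable would look like this; it additionally requires the import org.apache.flink.table.api.Slide:

// Sketch only: a 10-second window advancing every 5 seconds, so each row
// falls into two overlapping windows; same string-expression DSL as above.
Table slidingResult = dataTable
        .window(Slide.over("10.seconds").every("5.seconds").on("rt").as("sw"))
        .groupBy("id, sw")
        .select("id, id.count, temperature.avg, sw.end");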
3. Helper class: SensorData
package com.demo.sql;

public class SensorData {

    private String id;
    private Long dt;
    private Double temperature;

    public SensorData() {
    }

    public SensorData(String id, Long dt, Double temperature) {
        this.id = id;
        this.dt = dt;
        this.temperature = temperature;
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public Long getDt() {
        return dt;
    }

    public void setDt(Long dt) {
        this.dt = dt;
    }

    public Double getTemperature() {
        return temperature;
    }

    public void setTemperature(Double temperature) {
        this.temperature = temperature;
    }

    @Override
    public String toString() {
        return "SensorData{" +
                "id='" + id + '\'' +
                ", dt=" + dt +
                ", temperature=" + temperature +
                '}';
    }
}
4. Test data
sensor_1,1547718199,35.8
sensor_6,1547718201,15.4
sensor_7,1547718202,6.7
sensor_10,1547718205,38.1
sensor_1,1547718207,36.3
sensor_1,1547718209,32.8
sensor_1,1547718212,37.1
5. Program output
root
|-- id: STRING
|-- dt: BIGINT
|-- temperature: DOUBLE
|-- rt: TIMESTAMP(3) *ROWTIME*
sensor_1,1547718199,35.8,2019-01-17 09:43:19.0
sensor_6,1547718201,15.4,2019-01-17 09:43:21.0
sensor_7,1547718202,6.7,2019-01-17 09:43:22.0
sensor_10,1547718205,38.1,2019-01-17 09:43:25.0
sensor_1,1547718207,36.3,2019-01-17 09:43:27.0
sensor_1,1547718209,32.8,2019-01-17 09:43:29.0
sensor_1,1547718212,37.1,2019-01-17 09:43:32.0
result> sensor_1,1,35.8,2019-01-17 09:43:20.0
sql> (true,sensor_1,1,35.8,2019-01-17 09:43:20.0)
result> sensor_6,1,15.4,2019-01-17 09:43:30.0
sql> (true,sensor_6,1,15.4,2019-01-17 09:43:30.0)
result> sensor_1,2,34.55,2019-01-17 09:43:30.0
sql> (true,sensor_1,2,34.55,2019-01-17 09:43:30.0)
result> sensor_10,1,38.1,2019-01-17 09:43:30.0
sql> (true,sensor_10,1,38.1,2019-01-17 09:43:30.0)
result> sensor_7,1,6.7,2019-01-17 09:43:30.0
sql> (true,sensor_7,1,6.7,2019-01-17 09:43:30.0)
result> sensor_1,1,37.1,2019-01-17 09:43:40.0
sql> (true,sensor_1,1,37.1,2019-01-17 09:43:40.0)
As the output shows, with a 10-second tumbling window grouped by id, every window end time falls on a whole 10-second boundary. For example, the records
sensor_1,1547718207,36.3,2019-01-17 09:43:27.0
sensor_1,1547718209,32.8,2019-01-17 09:43:29.0
both have id sensor_1 and belong to the window ending at 2019-01-17 09:43:30.0, so they are aggregated together, yielding the record count (2) and the average temperature (34.55).
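The alignment comes from the fact that Flink anchors tumbling windows to the epoch rather than to the first event: a window's start is the event timestamp rounded down to a multiple of the window size. A quick sketch of that arithmetic (the class name WindowAlignment is just for illustration):

public class WindowAlignment {
    public static void main(String[] args) {
        long dt = 1547718207L;                 // event time in seconds (09:43:27)
        long size = 10L;                       // window size in seconds
        long windowStart = dt - (dt % size);   // 1547718200 -> 09:43:20
        long windowEnd = windowStart + size;   // 1547718210 -> 09:43:30
        System.out.println("[" + windowStart + ", " + windowEnd + ")");
    }
}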
It is also clear that the Table API and the SQL query produce exactly the same window statistics.
When emitting the results, toAppendStream supports only append-only output, while toRetractStream also supports output in which previously emitted results are updated.
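The windowed results above happen to be append-only, which is why the retract stream never actually prints a (false, ...) message here. The difference shows up with a continuously updating aggregation; the following sketch (reusing the sensor view registered above) is the kind of query where toAppendStream would fail with a TableException while toRetractStream works:

// A non-windowed aggregation: each incoming row updates a previously
// emitted result, so the sink must handle retractions. toRetractStream
// emits (false, oldRow) followed by (true, newRow) on every update.
Table runningAvg = tableEnv.sqlQuery(
        "select id, count(id) as cnt, avg(temperature) as avgTemp from sensor group by id");
tableEnv.toRetractStream(runningAvg, Row.class).print("retract");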