本章内容对应官网:
https://nightlies.apache.org/flink/flink-docs-release-1.12/dev/table/streaming/time_attributes.html
1.时间属性介绍
- Time attributes can be part of every table schema.
- They are defined when creating a table from a CREATE TABLE DDL or a DataStream.
- Once a time attribute is defined, it can be referenced as a field and used in time-based operations.
- As long as a time attribute is not modified, and is simply forwarded from one part of a query to another, it remains a valid time attribute. Time attributes behave like regular timestamps, and are accessible for calculations.
- When used in calculations, time attributes are materialized and act as standard timestamps. However, ordinary timestamps cannot be used in place of, or be converted to, time attributes.
重点:
- 时间属性可以在用DDL创建Table或者从DataStream转Table的时候指定
- 时间属性在使用的时候就是一个普通字段,并且是标准时间戳类型
- 普通的时间戳和时间属性不一样:普通时间戳既不能代替时间属性使用,也不能转换成时间属性
2.处理时间
2.1 DataStream到Table转换
https://www.bilibili.com/video/BV1oF411v79N?p=3
package No11_FlinkSQL.Time;
import Bean.WaterSensor;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.descriptors.Csv;
import org.apache.flink.table.descriptors.FileSystem;
import org.apache.flink.table.descriptors.Schema;
import org.apache.flink.types.Row;
import java.text.SimpleDateFormat;
import static org.apache.flink.table.api.Expressions.$;
/**
 * Demonstrates introducing a processing-time attribute while converting a DataStream into a Table.
 *
 * <p>Each line of the input text file is split on commas and turned into a {@code WaterSensor};
 * the stream is then converted to a Table with an extra {@code pt} column declared via
 * {@code proctime()}. The schema and the resulting rows are printed.
 */
public class FlinkSQL_ProcessTime_StreamToTable {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);

        DataStreamSource<String> lines =
                env.readTextFile("E:\\work\\bigdata\\flink\\src\\main\\resources\\sensort.txt");
        SingleOutputStreamOperator<WaterSensor> sensors = lines.map(new SensorLineParser());

        // Introduce time semantics: "pt" is declared with proctime(), which marks it as a
        // processing-time attribute of the resulting table.
        Table sensorTable = tableEnv.fromDataStream(
                sensors,
                $("id"),
                $("ts"),
                $("vc"),
                $("pt").proctime());

        sensorTable.printSchema();
        // Schema output recorded in the original notes:
        // root
        //  |-- id: STRING
        //  |-- ts: BIGINT
        //  |-- vc: INT
        //  |-- pt: TIMESTAMP(3) *PROCTIME*

        DataStream<Row> rows = tableEnv.toAppendStream(sensorTable, Row.class);
        rows.print();

        // The print sink lives on the DataStream side, so the job must be started explicitly.
        env.execute();
    }

    /**
     * Parses one comma-separated text line into a {@code WaterSensor}.
     *
     * <p>The three fields are passed positionally to the constructor as
     * (String, long, int) -- presumably id, ts and vc; confirm against {@code WaterSensor}.
     */
    private static final class SensorLineParser implements MapFunction<String, WaterSensor> {
        @Override
        public WaterSensor map(String line) throws Exception {
            String[] fields = line.split(",");
            String first = fields[0];
            long second = Long.parseLong(fields[1]);
            int third = Integer.parseInt(fields[2]);
            return new WaterSensor(first, second, third);
        }
    }
}
2.2 在创建表的DDL中定义
https://www.bilibili.com/video/BV1oF411v79N?p=4&spm_id_from=pageDriver
package No11_FlinkSQL.Time;
import Bean.WaterSensor;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableResult;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
import static org.apache.flink.table.api.Expressions.$;
/**
 * Demonstrates declaring a processing-time attribute inside a CREATE TABLE DDL.
 *
 * <p>The table {@code sensor} is backed by a CSV file through the filesystem connector and
 * exposes a computed column {@code pt_time} defined as {@code proctime()}. A filtered query
 * is then executed and its schema and rows are printed.
 */
public class FlinkSQL_ProcessTime_DDL {

    /**
     * Creates the table, runs the query and prints the schema followed by the result rows.
     *
     * @param args unused
     * @throws Exception declared for interface compatibility with the other examples
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);

        // Register the source table; pt_time is a computed processing-time column.
        tableEnv.executeSql(
                "create table sensor(" +
                        "id String," +
                        "ts bigint," +
                        "vc int," +
                        "pt_time as proctime())" +
                        "with(" +
                        "'connector' = 'filesystem'," +
                        "'path' = 'E:\\work\\bigdata\\flink\\src\\main\\resources\\sensort2.txt'," +
                        "'format' = 'csv')");

        String query = "select * from sensor where id = 'ws_001'";

        // executeSql submits the query as a job by itself and returns a handle to its result.
        TableResult tableResult = tableEnv.executeSql(query);

        // The Table object is only used to inspect the schema of the same query.
        Table table = tableEnv.sqlQuery(query);
        table.printSchema();
        // Expected columns (exact type rendering may differ between Flink versions):
        // root
        //  |-- id: STRING
        //  |-- ts: BIGINT
        //  |-- vc: INT
        //  |-- pt_time: TIMESTAMP(3) *PROCTIME*

        tableResult.print();

        // No env.execute() here: the job was already submitted by executeSql, and this program
        // defines no DataStream operators, so env.execute() would fail with
        // "No operators defined in streaming topology. Cannot execute."
    }
}
3.事件时间
3.1 DataStream到Table转换
- When converting a DataStream to a table, an event time attribute can be defined with the .rowtime property during schema definition.
- Timestamps and watermarks must have already been assigned in the DataStream being converted.
- There are two ways of defining the time attribute when converting a DataStream into a Table. Depending on whether the specified .rowtime field name exists in the schema of the DataStream, the timestamp is either (1) appended as a new column, or it (2) replaces an existing column. In either case, the event time timestamp field will hold the value of the DataStream event time timestamp.
// Illustrative snippet, not a complete program: Option 1 and Option 2 are alternatives
// (both declare `stream` and `table`), and `...` stands for a watermark strategy.
// Option 1:
// extract timestamp and assign watermarks based on knowledge of the stream
DataStream<Tuple2<String, String>> stream = inputStream.assignTimestampsAndWatermarks(...);
// declare an additional logical field as an event time attribute
Table table = tEnv.fromDataStream(stream, $("user_name"), $("data"), $("user_action_time").rowtime());
// Option 2:
// extract timestamp from first field, and assign watermarks based on knowledge of the stream
DataStream<Tuple3<Long, String, String>> stream = inputStream.assignTimestampsAndWatermarks(...);
// the first field has been used for timestamp extraction, and is no longer necessary
// replace first field with a logical event time attribute
Table table = tEnv.fromDataStream(stream, $("user_action_time").rowtime(), $("user_name"), $("data"));
// Usage:
// define a 10-minute tumbling window on the event time attribute, aliased "userActionWindow"
WindowedTable windowedTable = table.window(Tumble
.over(lit(10).minutes())
.on($("user_action_time"))
.as("userActionWindow"));
package No11_FlinkSQL.Time;
import Bean.WaterSensor;
import org.apache.flink.api.common.eventtime.*;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableResult;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
import java.time.Duration;
import static org.apache.flink.table.api.Expressions.$;
/**
 * Demonstrates introducing an event-time attribute while converting a DataStream into a Table.
 *
 * <p>Timestamps and watermarks are assigned on the stream first; the conversion then declares
 * an {@code et} column via {@code rowtime()}, which takes the timestamp extracted on the stream.
 * A filtered query over the table is executed and printed.
 */
public class FlinkSQL_EventTime_StreamToTable {

    /**
     * Builds the stream, converts it to a table with an event-time column and prints the
     * schema and the rows whose id is {@code ws_001}.
     *
     * @param args unused
     * @throws Exception if building or running the query fails
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
        DataStreamSource<String> sourceDS = env.readTextFile("D:\\IdeaProjects\\bigdata\\flink\\src\\main\\resources\\sensort.txt");

        // To introduce event time during stream-to-table conversion, timestamps and
        // watermarks must be assigned on the stream first.
        WatermarkStrategy<WaterSensor> waterSensorWatermarkStrategy = WatermarkStrategy
                .<WaterSensor>forBoundedOutOfOrderness(Duration.ofSeconds(2))
                .withTimestampAssigner(new SerializableTimestampAssigner<WaterSensor>() {
                    @Override
                    public long extractTimestamp(WaterSensor waterSensor, long recordTimestamp) {
                        // Scaled by 1000 -- presumably ts is in epoch seconds and is converted
                        // to the milliseconds Flink expects; confirm against the input data.
                        return waterSensor.getTs() * 1000;
                    }
                });

        SingleOutputStreamOperator<WaterSensor> sensorDS = sourceDS
                .map(new MapFunction<String, WaterSensor>() {
                    @Override
                    public WaterSensor map(String s) throws Exception {
                        String[] split = s.split(",");
                        return new WaterSensor(split[0], Long.parseLong(split[1]), Integer.parseInt(split[2]));
                    }
                })
                .assignTimestampsAndWatermarks(waterSensorWatermarkStrategy);

        Table table = tableEnv.fromDataStream(sensorDS,
                $("id"),
                $("ts"),
                $("vc"),
                $("et").rowtime()); // this column takes the timestamp extracted on the stream
        table.printSchema();

        // Concatenating the Table into the SQL text relies on its string form being usable
        // as a table reference in the query.
        TableResult tableResult = tableEnv.executeSql("select * from " + table + " where id = 'ws_001'");
        tableResult.print();

        // No env.execute() here: executeSql already submitted the query as a job. Calling
        // env.execute() afterwards would only start a second, sink-less job that re-reads
        // the file and produces no output.
    }
}
3.2 在创建表的DDL中定义
CREATE TABLE user_actions (
user_name STRING,
data STRING,
user_action_time TIMESTAMP(3), -- the time field; it is of type TIMESTAMP(3)
-- define the watermark on top of the time field
WATERMARK FOR user_action_time AS user_action_time - INTERVAL '5' SECOND
) WITH (
...
);
-- count distinct users per 10-minute tumbling window over user_action_time
SELECT TUMBLE_START(user_action_time, INTERVAL '10' MINUTE), COUNT(DISTINCT user_name)
FROM user_actions
GROUP BY TUMBLE(user_action_time, INTERVAL '10' MINUTE);
package No11_FlinkSQL.Time;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
/**
 * Demonstrates declaring an event-time attribute and its watermark inside a CREATE TABLE DDL.
 *
 * <p>The table {@code source_sensor} reads a CSV file through the filesystem connector. Its
 * computed column {@code rt} is derived from {@code ts} with
 * {@code to_timestamp(from_unixtime(...))}, and a watermark of {@code rt} minus 5 seconds is
 * declared on it. The schema and all rows are printed.
 */
public class FlinkSQL_EventTimeDDL {

    /** DDL for the CSV-backed source table with the event-time column and watermark. */
    private static final String CREATE_SOURCE_SENSOR_DDL =
            " CREATE TABLE source_sensor (\n"
                    + " id String,\n"
                    + " ts bigint,\n"
                    + " vc int,\n"
                    + " rt as to_timestamp(from_unixtime(ts,'yyyy-MM-dd HH:mm:ss')),"
                    + " WATERMARK FOR rt AS rt - INTERVAL '5' SECOND\n"
                    + ") WITH (\n"
                    + "'connector' = 'filesystem',"
                    + "'path' = 'E:\\work\\bigdata\\flink\\src\\main\\resources\\sensort.txt',"
                    + "'format' = 'csv')";

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);

        // Register the table, then look it up by name to inspect its schema.
        tableEnv.executeSql(CREATE_SOURCE_SENSOR_DDL);
        Table sensorTable = tableEnv.from("source_sensor");
        sensorTable.printSchema();

        // Convert back to a DataStream and print every row.
        DataStream<Row> rows = tableEnv.toAppendStream(sensorTable, Row.class);
        rows.print();

        // The print sink lives on the DataStream side, so the job must be started explicitly.
        env.execute();
    }
}