package com.thoughtworks.window
import java.util.Properties
import com.thoughtworks.source.SensorReading
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011
/**
* WaterMarkWindow
* 测试数据
* ID 时间 温度
* sensor_1 1547718100 35.8
* sensor_1 1547718101 34.8
* sensor_1 1547718102 33.8
* sensor_1 1547718103 32.8
* sensor_1 1547718104 31.8
* sensor_1 1547718105 30.8
* sensor_1 1547718106 29.8
* sensor_1 1547718107 28.8
* sensor_1 1547718108 27.8
* sensor_1 1547718109 26.8
* sensor_1 1547718110 25.8
* sensor_1 1547718111 20.8
* sensor_1 1547718112 19.8
* sensor_1 1547718113 18.8
* sensor_1 1547718114 17.8
* sensor_1 1547718115 16.8
* sensor_1 1547718116 15.8
*
* sensor_1 1547718100 23.1
* sensor_1 1547718101 24.1
* sensor_1 1547718102 25.1
* sensor_1 1547718103 26.1
* sensor_1 1547718104 27.1
* sensor_1 1547718105 28.1
* sensor_1 1547718106 29.1
* sensor_1 1547718107 30.2
* sensor_1 1547718108 31.2
* sensor_1 1547718109 19.2
* sensor_1 1547718110 18.2
* sensor_1 1547718111 17.2
* sensor_1 1547718112 16.2
* sensor_1 1547718113 15.2
* sensor_1 1547718114 14.2
* sensor_1 1547718115 13.2
* sensor_1 1547718116 12.2
* sensor_1 1547718117 11.2
* sensor_1 1547718118 8.2
* sensor_1 1547718119 9.2
*/
object WaterMarkWindow {

  /**
   * Parses one whitespace-separated record of the form `id timestamp temperature`.
   *
   * Any run of whitespace (spaces, tabs) separates the fields; fields after the
   * third one are ignored.
   *
   * @param line raw record text as read from Kafka
   * @return the parsed reading
   * @throws IllegalArgumentException if the record has fewer than three fields
   * @throws NumberFormatException if the timestamp or the temperature is not numeric
   */
  private def parseReading(line: String): SensorReading =
    line.split("\\s+") match {
      case Array(id, timestamp, temperature, _*) =>
        SensorReading(id, timestamp.toLong, temperature.toDouble)
      case _ =>
        // Fail with the offending record in the message instead of a bare index error.
        throw new IllegalArgumentException(s"Malformed sensor record: '$line'")
    }

  /**
   * Builds and runs the job:
   * Kafka source -> parse -> timestamps/watermarks -> keyed sliding event-time
   * window -> reduce -> print.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // 1. Execution environment.
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

    // Run on event time.
    //   EventTime      - the time at which the event occurred
    //   ProcessingTime - the time at which the record is processed
    //   IngestionTime  - the time at which the record entered the system
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    // The periodic watermark interval is left at its default; to override it:
    // env.getConfig.setAutoWatermarkInterval(5000)

    // 2. Source: Kafka.
    val properties = new Properties()
    // Kafka broker addresses and ports.
    properties.setProperty("bootstrap.servers", "node02:9092,node03:9092,node04:9092")
    // Consumer group.
    properties.setProperty("group.id", "flink")
    // Key / value deserializers.
    // NOTE(review): the consumer below is given its own DeserializationSchema and
    // presumably ignores these two settings - confirm before relying on or removing them.
    properties.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    properties.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    // Offset reset policy: start from the latest records when the group has no usable offset.
    properties.setProperty("auto.offset.reset", "latest")

    // Consumer arguments:
    //   1st - the topic
    //   2nd - the (de)serialization schema converting Kafka byte messages to Flink objects
    //   3rd - the Kafka properties
    val stream: DataStream[String] =
      env.addSource(new FlinkKafkaConsumer011[String]("flink", new SimpleStringSchema(), properties))

    // 3. Transformation.
    val waterMarkStream: DataStream[SensorReading] = stream
      .map(line => parseReading(line))
      // For strictly ascending data, timestamps could be assigned with:
      // .assignAscendingTimestamps(_.timeStamp*1000)
      // For out-of-order data: assign timestamps and watermarks that tolerate
      // up to 1000 ms of out-of-orderness.
      .assignTimestampsAndWatermarks(
        new BoundedOutOfOrdernessTimestampExtractor[SensorReading](Time.milliseconds(1000)) {
          // timeStamp is scaled by 1000; the sample data carries epoch seconds,
          // while Flink event timestamps are in milliseconds.
          override def extractTimestamp(data: SensorReading): Long =
            data.timeStamp * 1000
        })

    val result = waterMarkStream
      .keyBy(_.id)
      // Sliding event-time window: 5 s long, evaluated every 2 s.
      .window(SlidingEventTimeWindows.of(Time.seconds(5), Time.seconds(2)))
      // Allowed lateness after the watermark passes the end of the window.
      // NOTE(review): this is 2 milliseconds, not 2 seconds - confirm the intended unit.
      .allowedLateness(Time.milliseconds(2))
      // Per key and window: keep the first id, the smallest timestamp and the smallest temperature.
      .reduce { (left, right) =>
        SensorReading(
          left.id,
          left.timeStamp.min(right.timeStamp),
          left.temperature.min(right.temperature))
      }

    // 4. Sink: print with the "stream" prefix using a single sink task.
    result.print("stream").setParallelism(1)

    // 5. Execute.
    env.execute("API Test")
  }
}
// 抛出一个问题:窗口长度为 5s,滑动步长为 2s,延迟设置为 2s,为何观察到的窗口长度为 9 秒?
// 时间倒序输入时,窗口长度为 5s?
// 正序输入时,窗口长度为 9s?
// 感觉功能实现出来了,但是对 watermark 的理解还不够。
// 想要源码的同学留邮箱即可。