还有几分钟就登机了,目前在哈尔滨飞往北京的候机厅。由于晚上回去很晚,第二天又忙,没时间更新文章,所以挤时间整理了一下。
Flink如何实现3个实时流同时join?整体思路就是:
• 设置相同的时间类型
• 设置相同的时间窗口
这样到达同一个窗口时,3个实时流会同时触发。
由于flink不支持3个实时流同时join,你需要先把2个实时流join完成的结果,再跟第三个实时流join。
import java.util
import SessionIdKeyedProcessFunction.MyTimeTimestampsAndWatermarks
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.{AssignerWithPeriodicWatermarks, AssignerWithPunctuatedWatermarks}
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.watermark.Watermark
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.util.Collector
/**
 * Demo job: joins three socket word streams on the word itself by chaining two
 * tumbling-window joins (stream0 JOIN stream1, then that result JOIN stream2).
 *
 * The job runs in event time. The result of the first join is only emitted when its
 * window fires, so under processing time those records would reach the second join at
 * (or after) the window boundary and would not reliably share a window with the
 * records of the third stream from the same interval. With event-time windows Flink
 * stamps each window result with the window's max timestamp (`end - 1`), so a
 * downstream window of the same size assigns it to the same window.
 *
 * NOTE: event-time windows fire only when the watermark passes the window end. The
 * watermark produced by [[FlinkWindow.MyTimeTimestampsAndWatermarks]] advances only
 * when elements arrive, so every socket has to keep receiving data for results to
 * be emitted.
 */
object FlinkWindow {

  /**
   * Periodic watermark assigner for `(word, count)` pairs.
   *
   * Each element is stamped with the wall-clock time at which it passes through this
   * assigner; the watermark trails the largest timestamp handed out so far by
   * `maxOutOfOrderness` milliseconds.
   */
  class MyTimeTimestampsAndWatermarks
      extends AssignerWithPeriodicWatermarks[(String, Int)]
      with Serializable {

    // Allowed lag of the watermark behind the newest timestamp: 3.5 seconds.
    val maxOutOfOrderness = 3500L

    // Largest timestamp assigned so far (0 until the first element is seen).
    var currentMaxTimestamp: Long = _

    /**
     * Stamps the element with the current wall-clock time.
     *
     * @param element                  the `(word, count)` pair (its content is not inspected)
     * @param previousElementTimestamp timestamp previously attached to the element (ignored)
     * @return the timestamp assigned to the element, in epoch milliseconds
     */
    override def extractTimestamp(element: (String, Int), previousElementTimestamp: Long): Long = {
      val timestamp = System.currentTimeMillis()
      currentMaxTimestamp = Math.max(timestamp, currentMaxTimestamp)
      timestamp
    }

    /** @return the highest timestamp seen so far minus the out-of-orderness bound */
    override def getCurrentWatermark(): Watermark =
      new Watermark(currentMaxTimestamp - maxOutOfOrderness)
  }

  /**
   * Reads text lines from `localhost:port`, splits them into words and pairs each
   * word with the count 1, then attaches timestamps and watermarks.
   *
   * Empty tokens (produced by blank lines or lines that start with a non-word
   * character) are dropped so that they are not joined under the empty-string key.
   */
  private def wordStream(env: StreamExecutionEnvironment, port: Int): DataStream[(String, Int)] =
    env
      .socketTextStream("localhost", port)
      .flatMap((line: String) => line.split("\\W+").filter(_.nonEmpty))
      .map((word: String) => (word, 1))
      .assignTimestampsAndWatermarks(new MyTimeTimestampsAndWatermarks())

  /**
   * Builds and runs the three-way join job.
   *
   * Reads from sockets 9001, 9002 and 9003 on localhost, prints every input stream,
   * the intermediate two-way join and the final three-way join.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Local import keeps this block self-contained; the file header only imports the
    // processing-time assigner.
    import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows

    // Both joins must use the same window size so their windows line up.
    val windowSize = Time.seconds(6)

    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val inputMap = wordStream(env, 9001)
    inputMap.print()

    val inputMap1 = wordStream(env, 9002)
    inputMap1.print()

    val inputMap2 = wordStream(env, 9003)
    inputMap2.print()

    // First join: (word, countFromStream0, countFromStream1)
    val aa = inputMap
      .join(inputMap1)
      .where(_._1)
      .equalTo(_._1)
      .window(TumblingEventTimeWindows.of(windowSize))
      .apply { (t1: (String, Int), t2: (String, Int), out: Collector[(String, Int, Int)]) =>
        out.collect((t1._1, t1._2, t2._2))
      }
    aa.print()

    // Second join: (word, countFromStream0, countFromStream1, countFromStream2)
    val cc = aa
      .join(inputMap2)
      .where(_._1)
      .equalTo(_._1)
      .window(TumblingEventTimeWindows.of(windowSize))
      .apply { (t1: (String, Int, Int), t2: (String, Int), out: Collector[(String, Int, Int, Int)]) =>
        out.collect((t1._1, t1._2, t1._3, t2._2))
      }
    cc.print()

    env.execute()
  }
}
leftjoin、rightjoin:Flink官网没有明确给出实现方案,join算子本身也无法实现(它只输出两侧都匹配上的数据),需要用coGroup来实现。下面是leftjoin的例子,rightjoin参考着改一下就可以了。
import util.source.StreamDataSource1;
import util.source.StreamDataSource;
import org.apache.flink.api.common.functions.CoGroupFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.tuple.Tuple5;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;
public class FlinkTumblingWindowsLeftJoinDemo {
public static void main(String[] args) throws Exception {
int windowSize = 10;
long delay = 5100L;
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
env.setParallelism(1);
// 设置数据源
DataStream<Tuple3<String, String, Long>> leftSource = env.addSource(new StreamDataSource()).name("Demo Source");
DataStream<Tuple3<String, String, Long>> rightSource = env.addSource(new StreamDataSource1()).name("Demo Source");
// 设置水位线
DataStream<Tuple3<String, String, Long>> leftStream = leftSource.assignTimestampsAndWatermarks(
new BoundedOutOfOrdernessTimestampExtractor<Tuple3<String, String, Long>>(Time.milliseconds(delay)) {
@Override
public long extractTimestamp(Tuple3<String, String, Long> element) {
return element.f2;
}
}
);
DataStream<Tuple3<String, String, Long>> rigjhtStream = rightSource.assignTimestampsAndWatermarks(
new BoundedOutOfOrdernessTimestampExtractor<Tuple3<String, String, Long>>(Time.milliseconds(delay)) {
@Override
public long extractTimestamp(Tuple3<String, String, Long> element) {
return element.f2;
}
}
);
// join 操作
leftStream.coGroup(rigjhtStream)
.where(new LeftSelectKey()).equalTo(new RightSelectKey())
.window(TumblingEventTimeWindows.of(Time.seconds(windowSize)))
.apply(new LeftJoin())
.print();
env.execute("TimeWindowDemo");
}
public static class LeftJoin implements CoGroupFunction<Tuple3<String, String, Long>, Tuple3<String, String, Long>, Tuple5<String, String, String, Long, Long>> {
@Override
public void coGroup(Iterable<Tuple3<String, String, Long>> leftElements, Iterable<Tuple3<String, String, Long>> rightElements, Collector<Tuple5<String, String, String, Long, Long>> out) {
for (Tuple3<String, String, Long> leftElem : leftElements) {
boolean hadElements = false;
for (Tuple3<String, String, Long> rightElem : rightElements) {
out.collect(new Tuple5<>(leftElem.f0, leftElem.f1, rightElem.f1, leftElem.f2, rightElem.f2));
hadElements = true;
}
if (!hadElements) {
out.collect(new Tuple5<>(leftElem.f0, leftElem.f1, "null", leftElem.f2, -1L));
}
}
}
}
public static class LeftSelectKey implements KeySelector<Tuple3<String, String, Long>, String> {
@Override
public String getKey(Tuple3<String, String, Long> w) {
return w.f0;
}
}
public static class RightSelectKey implements KeySelector<Tuple3<String, String, Long>, String> {
@Override
public String getKey(Tuple3<String, String, Long> w) {
return w.f0;
}
}
想看更多大厂技术干货分享?请关注下方公众号,回复“spark”、“flink”、“机器学习”、“前端”即可获取海量学习资料。