FlinkDataStream的Join记录
前言
本次主要记录如何使用 Flink 的 Java API 实现 DataStream 的双流 Join。
一、TumbleJoin
/**
 * Demo job that joins two socket text streams on their order id inside
 * two-minute tumbling event-time windows and prints every joined pair.
 *
 * <p>Each input line is split on {@code ","}: field 0 is used as the order id,
 * field 1 is parsed as an {@code int}, and field 2 is handed to
 * {@code DateTimeUtil.toTs} (presumably a date-time string converted to an
 * epoch timestamp; confirm against that helper).
 */
public class TestTumbleJoin {

    /** Separator written between the fields of a joined output line. */
    private static final String SEP = "---";

    /**
     * Builds and runs the job.
     *
     * @param args command-line arguments; not read
     * @throws Exception if the job fails to build or execute
     */
    public static void main(String[] args) throws Exception {
        // 1. Create the streaming execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // 2. Run everything with a parallelism of one.
        env.setParallelism(1);
        // 3. Checkpointing / state backend setup is left disabled here:
        // CkAndStateBacked.setCheckPointAndStateBackend(env,"FS");

        // 4. Open both raw text sources before wiring any transformation.
        DataStreamSource<String> leftLines = env.socketTextStream("xxxx", 7777);
        DataStreamSource<String> rightLines = env.socketTextStream("xxxx", 8888);

        // 5. Parse each line into an event and take the event time from the event itself.
        WatermarkStrategy<OrderEvent1> leftWatermarks = WatermarkStrategy
                .<OrderEvent1>forMonotonousTimestamps()
                .withTimestampAssigner((event, recordTimestamp) -> event.getTimestamp());
        SingleOutputStreamOperator<OrderEvent1> leftEvents = leftLines
                .map(new LeftLineParser())
                .assignTimestampsAndWatermarks(leftWatermarks);

        WatermarkStrategy<OrderEvent2> rightWatermarks = WatermarkStrategy
                .<OrderEvent2>forMonotonousTimestamps()
                .withTimestampAssigner((event, recordTimestamp) -> event.getTimestamp());
        SingleOutputStreamOperator<OrderEvent2> rightEvents = rightLines
                .map(new RightLineParser())
                .assignTimestampsAndWatermarks(rightWatermarks);

        // 6. Join the two streams on order id within two-minute tumbling windows.
        DataStream<String> joined = leftEvents.join(rightEvents)
                .where(OrderEvent1::getOrderId)
                .equalTo(OrderEvent2::getOrderId)
                .window(TumblingEventTimeWindows.of(Time.minutes(2)))
                .apply(new PairFormatter());

        // 7. Print the joined lines.
        joined.print();
        // 8. Submit the job.
        env.execute();
    }

    /** Parses one comma-separated line from the first socket into an {@code OrderEvent1}. */
    private static final class LeftLineParser implements MapFunction<String, OrderEvent1> {
        @Override
        public OrderEvent1 map(String value) throws Exception {
            String[] fields = value.split(",");
            Long eventTime = DateTimeUtil.toTs(fields[2]);
            return new OrderEvent1(fields[0], Integer.parseInt(fields[1]), eventTime);
        }
    }

    /** Parses one comma-separated line from the second socket into an {@code OrderEvent2}. */
    private static final class RightLineParser implements MapFunction<String, OrderEvent2> {
        @Override
        public OrderEvent2 map(String value) throws Exception {
            String[] fields = value.split(",");
            Long eventTime = DateTimeUtil.toTs(fields[2]);
            return new OrderEvent2(fields[0], Integer.parseInt(fields[1]), eventTime);
        }
    }

    /** Emits one output line per joined pair, listing the left fields and then the right fields. */
    private static final class PairFormatter implements FlatJoinFunction<OrderEvent1, OrderEvent2, String> {
        @Override
        public void join(OrderEvent1 first, OrderEvent2 second, Collector<String> out) throws Exception {
            String line = first.getOrderId() + SEP + first.getPrice() + SEP + first.getTimestamp()
                    + SEP + second.getOrderId() + SEP + second.getCount() + SEP + second.getTimestamp();
            out.collect(line);
        }
    }
}
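需要注意的是:这里使用的是事件时间(event time)窗口,窗口只有在 watermark 越过窗口结束时间后才会触发;而 Join 算子的 watermark 取两条输入流 watermark 中较小的那个,因此必须两边的流都输入时间更靠后的数据来推进 watermark,才会看到关联的数据!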
二、SlideWindowJoin
/**
 * Demo job that joins two socket text streams on their order id inside
 * sliding event-time windows (10-second size, 5-second slide) and prints
 * every joined pair.
 *
 * <p>Each input line is split on {@code ","}: field 0 is used as the order id,
 * field 1 is parsed as an {@code int}, and field 2 is handed to
 * {@code DateTimeUtil.toTs} (presumably a date-time string converted to an
 * epoch timestamp; confirm against that helper).
 */
public class TestSlideWindow {

    /** Separator written between the fields of a joined output line. */
    private static final String SEP = "---";

    /**
     * Builds and runs the job.
     *
     * @param args command-line arguments; not read
     * @throws Exception if the job fails to build or execute
     */
    public static void main(String[] args) throws Exception {
        // 1. Create the streaming execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // 2. Run everything with a parallelism of one.
        env.setParallelism(1);
        // 3. Checkpointing / state backend setup is left disabled here:
        // CkAndStateBacked.setCheckPointAndStateBackend(env,"FS");

        // 4. Open both raw text sources before wiring any transformation.
        DataStreamSource<String> leftLines = env.socketTextStream("xxxx", 7777);
        DataStreamSource<String> rightLines = env.socketTextStream("xxxx", 8888);

        // 5. Parse each line into an event and take the event time from the event itself.
        WatermarkStrategy<OrderEvent1> leftWatermarks = WatermarkStrategy
                .<OrderEvent1>forMonotonousTimestamps()
                .withTimestampAssigner((event, recordTimestamp) -> event.getTimestamp());
        SingleOutputStreamOperator<OrderEvent1> leftEvents = leftLines
                .map(new LeftLineParser())
                .assignTimestampsAndWatermarks(leftWatermarks);

        WatermarkStrategy<OrderEvent2> rightWatermarks = WatermarkStrategy
                .<OrderEvent2>forMonotonousTimestamps()
                .withTimestampAssigner((event, recordTimestamp) -> event.getTimestamp());
        SingleOutputStreamOperator<OrderEvent2> rightEvents = rightLines
                .map(new RightLineParser())
                .assignTimestampsAndWatermarks(rightWatermarks);

        // 6. Join the two streams on order id within sliding windows
        //    of 10 seconds that advance every 5 seconds.
        DataStream<String> joined = leftEvents.join(rightEvents)
                .where(OrderEvent1::getOrderId)
                .equalTo(OrderEvent2::getOrderId)
                .window(SlidingEventTimeWindows.of(Time.seconds(10),Time.seconds(5)))
                .apply(new PairFormatter());

        // 7. Print the joined lines.
        joined.print();
        // 8. Submit the job.
        env.execute();
    }

    /** Parses one comma-separated line from the first socket into an {@code OrderEvent1}. */
    private static final class LeftLineParser implements MapFunction<String, OrderEvent1> {
        @Override
        public OrderEvent1 map(String value) throws Exception {
            String[] fields = value.split(",");
            Long eventTime = DateTimeUtil.toTs(fields[2]);
            return new OrderEvent1(fields[0], Integer.parseInt(fields[1]), eventTime);
        }
    }

    /** Parses one comma-separated line from the second socket into an {@code OrderEvent2}. */
    private static final class RightLineParser implements MapFunction<String, OrderEvent2> {
        @Override
        public OrderEvent2 map(String value) throws Exception {
            String[] fields = value.split(",");
            Long eventTime = DateTimeUtil.toTs(fields[2]);
            return new OrderEvent2(fields[0], Integer.parseInt(fields[1]), eventTime);
        }
    }

    /** Emits one output line per joined pair, listing the left fields and then the right fields. */
    private static final class PairFormatter implements FlatJoinFunction<OrderEvent1, OrderEvent2, String> {
        @Override
        public void join(OrderEvent1 first, OrderEvent2 second, Collector<String> out) throws Exception {
            String line = first.getOrderId() + SEP + first.getPrice() + SEP + first.getTimestamp()
                    + SEP + second.getOrderId() + SEP + second.getCount() + SEP + second.getTimestamp();
            out.collect(line);
        }
    }
}
这里我在截图里面已经标注了关联的情况了!
总结
可以看到,滚动窗口和滑动窗口都可以实现双流 Join。需要注意的是,两条流的 watermark 都必须推进,窗口才会触发并输出关联结果;另外,滑动窗口之间存在重叠(例如上面窗口大小为 10 秒、滑动步长为 5 秒时,每条数据会落入 2 个窗口),因此同一对关联数据可能被重复输出。