一、intervalJoin
流A intervalJoin 流B,只要满足以下条件即为join成功。
流B的时间戳大于等于流A的时间戳加下界(下界可为负数,如本例中的 -3 秒)
且流B的时间戳小于等于流A的时间戳加上界
且流A的key等于流B的key。
ProcessJoinFunction
/**
 * Interval-join example: joins two Kafka JSON streams on their "uuid" field.
 *
 * <p>A right-stream element matches a left-stream element when their keys are
 * equal and the right timestamp lies within [leftTs - 3s, leftTs + 3s].
 */
public class TwoStreamJoinStream {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(3);
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        DataStreamSource<String> leftSource = env.addSource(
                new FlinkKafkaConsumer010<String>("stream1", new SimpleStringSchema(), KafkaUtils.comsumerProps()));
        DataStreamSource<String> rightSource = env.addSource(
                new FlinkKafkaConsumer010<String>("stream2", new SimpleStringSchema(), KafkaUtils.comsumerProps()));

        // Both sides need event-time timestamps/watermarks before an interval join.
        KeyedStream<String, String> leftKeyed =
                keyByUuid(leftSource.assignTimestampsAndWatermarks(new EventTimeExtractor()));
        KeyedStream<String, String> rightKeyed =
                keyByUuid(rightSource.assignTimestampsAndWatermarks(new EventTimeExtractor()));

        SingleOutputStreamOperator<String> resultStream = leftKeyed
                .intervalJoin(rightKeyed)
                // Match when: leftTs - 3s <= rightTs <= leftTs + 3s (and keys are equal).
                .between(Time.seconds(-3), Time.seconds(3))
                .process(new ProcessJoinFunction<String, String, String>() {
                    @Override
                    public void processElement(String left, String right, Context context, Collector<String> out) throws Exception {
                        out.collect(mergeRecords(left, right));
                    }
                });
        resultStream.print();
        env.execute();
    }

    /** Keys a stream of JSON strings by their "uuid" field. */
    private static KeyedStream<String, String> keyByUuid(SingleOutputStreamOperator<String> stream) {
        return stream.keyBy(new KeySelector<String, String>() {
            @Override
            public String getKey(String line) throws Exception {
                return JSON.parseObject(line).getString("uuid");
            }
        });
    }

    /**
     * Merges a matched pair of JSON records into one output record:
     * uuid/time1/tag1 come from the left record, time2/tag2 from the right.
     */
    private static String mergeRecords(String left, String right) {
        JSONObject l = JSON.parseObject(left);
        JSONObject r = JSON.parseObject(right);
        JSONObject merged = new JSONObject();
        merged.put("uuid", l.get("uuid"));
        merged.put("time1", l.get("time"));
        merged.put("time2", r.get("time"));
        merged.put("tag1", l.get("tag1"));
        merged.put("tag2", r.get("tag2"));
        return merged.toJSONString();
    }
}
二、window join
流A与流B按照相同的规则开窗,在同一窗口内且流A的key等于流B的key即表示关联上。
/**
 * Window-join example: joins two Kafka JSON streams on their "uuid" field
 * within the same 3-second tumbling event-time window.
 */
public class TwoStreamJoinStream {

    /** Extracts the "uuid" field from a JSON line; used as the join key for both streams. */
    private static final KeySelector<String, String> UUID_KEY = new KeySelector<String, String>() {
        @Override
        public String getKey(String line) throws Exception {
            return JSON.parseObject(line).getString("uuid");
        }
    };

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(3);
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        DataStreamSource<String> leftSource = env.addSource(
                new FlinkKafkaConsumer010<String>("stream1", new SimpleStringSchema(), KafkaUtils.comsumerProps()));
        DataStreamSource<String> rightSource = env.addSource(
                new FlinkKafkaConsumer010<String>("stream2", new SimpleStringSchema(), KafkaUtils.comsumerProps()));

        SingleOutputStreamOperator<String> leftStream =
                leftSource.assignTimestampsAndWatermarks(new EventTimeExtractor());
        SingleOutputStreamOperator<String> rightStream =
                rightSource.assignTimestampsAndWatermarks(new EventTimeExtractor());

        DataStream<String> resultStream = leftStream
                .join(rightStream)
                .where(UUID_KEY)   // join key of the left stream
                .equalTo(UUID_KEY) // join key of the right stream
                // Pairs match only when they land in the same 3-second window.
                .window(TumblingEventTimeWindows.of(Time.seconds(3)))
                .apply(new FlatJoinFunction<String, String, String>() {
                    @Override
                    public void join(String left, String right, Collector<String> out) throws Exception {
                        out.collect(mergeRecords(left, right));
                    }
                });
        resultStream.print();
        env.execute();
    }

    /**
     * Merges a matched pair of JSON records into one output record:
     * uuid/time1/tag1 come from the left record, time2/tag2 from the right.
     */
    private static String mergeRecords(String left, String right) {
        JSONObject l = JSON.parseObject(left);
        JSONObject r = JSON.parseObject(right);
        JSONObject merged = new JSONObject();
        merged.put("uuid", l.get("uuid"));
        merged.put("time1", l.get("time"));
        merged.put("time2", r.get("time"));
        merged.put("tag1", l.get("tag1"));
        merged.put("tag2", r.get("tag2"));
        return merged.toJSONString();
    }
}
三、CoProcessFunction实现双流join
流A connect 流B。在KeyedCoProcessFunction里,创建两个MapState,每个流对应一个MapState。
每个流接收到一条数据时,都尝试从另外一个流对应的state中获取能关联上的数据:如果获取到则输出,并将其从另外一个流对应的state中删除;如果没获取到,则将此数据放入当前流对应的state里。
/**
 * Two-stream join built by hand with connect + KeyedCoProcessFunction.
 *
 * <p>Each side buffers unmatched records in its own {@link MapState} keyed by
 * uuid. When a record arrives, it first probes the OTHER side's state: on a
 * hit it emits the merged record and removes the match; on a miss it buffers
 * itself for the other side to find later.
 *
 * <p>Output schema (same for both match directions):
 * uuid, time1/tag1 from stream1's record, time2/tag2 from stream2's record.
 *
 * <p>NOTE(review): buffered entries are never expired — a record whose
 * counterpart never arrives stays in state forever. For production, add a
 * state TTL or register an event-time timer to clean up stale entries.
 */
public class TwoStreamJoinStream {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(3);
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        DataStreamSource<String> sourceStream1 = env.addSource(new FlinkKafkaConsumer010<String>("stream1", new SimpleStringSchema(), KafkaUtils.comsumerProps()));
        DataStreamSource<String> sourceStream2 = env.addSource(new FlinkKafkaConsumer010<String>("stream2", new SimpleStringSchema(), KafkaUtils.comsumerProps()));
        // Both streams must be keyed by the same field so matching records
        // land on the same parallel instance and share keyed state.
        KeyedStream<String, String> streamJoin1 = sourceStream1
                .keyBy(new KeySelector<String, String>() {
                    @Override
                    public String getKey(String line) throws Exception {
                        return JSON.parseObject(line).getString("uuid");
                    }
                });
        KeyedStream<String, String> streamJoin2 = sourceStream2
                .keyBy(new KeySelector<String, String>() {
                    @Override
                    public String getKey(String line) throws Exception {
                        return JSON.parseObject(line).getString("uuid");
                    }
                });
        SingleOutputStreamOperator<String> resultStream = streamJoin1
                .connect(streamJoin2)
                .process(new KeyedCoProcessFunction<String, String, String, String>() {
                    // Unmatched records from stream1 / stream2, keyed by uuid.
                    private MapState<String, String> state1;
                    private MapState<String, String> state2;

                    @Override
                    public void open(Configuration parameters) throws Exception {
                        state1 = getRuntimeContext().getMapState(new MapStateDescriptor<String, String>("state1", String.class, String.class));
                        state2 = getRuntimeContext().getMapState(new MapStateDescriptor<String, String>("state2", String.class, String.class));
                        super.open(parameters);
                    }

                    /** Handles a stream1 record: probe stream2's buffer, else buffer it. */
                    @Override
                    public void processElement1(String s, Context context, Collector<String> collector) throws Exception {
                        String uuid = JSON.parseObject(s).getString("uuid");
                        if (state2.contains(uuid)) {
                            collector.collect(merge(s, state2.get(uuid)));
                            state2.remove(uuid);
                        } else {
                            state1.put(uuid, s);
                        }
                    }

                    /** Handles a stream2 record: probe stream1's buffer, else buffer it. */
                    @Override
                    public void processElement2(String s, Context context, Collector<String> collector) throws Exception {
                        String uuid = JSON.parseObject(s).getString("uuid");
                        if (state1.contains(uuid)) {
                            // BUGFIX: the original emitted tag1 = stream2's tag2 and
                            // tag2 = stream1's tag1 (fields crossed). Output must match
                            // processElement1: time1/tag1 from stream1, time2/tag2 from stream2.
                            collector.collect(merge(state1.get(uuid), s));
                            state1.remove(uuid);
                        } else {
                            state2.put(uuid, s);
                        }
                    }

                    /**
                     * Merges a matched pair into one JSON record. {@code fromStream1}
                     * supplies uuid/time1/tag1; {@code fromStream2} supplies time2/tag2.
                     */
                    private String merge(String fromStream1, String fromStream2) {
                        JSONObject jn1 = JSON.parseObject(fromStream1);
                        JSONObject jn2 = JSON.parseObject(fromStream2);
                        JSONObject jn = new JSONObject();
                        jn.put("uuid", jn1.get("uuid"));
                        jn.put("time1", jn1.get("time"));
                        jn.put("time2", jn2.get("time"));
                        jn.put("tag1", jn1.get("tag1"));
                        jn.put("tag2", jn2.get("tag2"));
                        return jn.toJSONString();
                    }
                });
        resultStream.print();
        env.execute();
    }
}