interval join是区间join, 为了效率一般设置有一定的范围, 但是某些数据超出设置范围就会丢失, 对于无法容忍丢失的场景, 可以使用cogroup+侧输出流+connect 解决, 将join不到的数据保存到指定介质.
大概思路如下
- 使用windowAll进行数据筛选, 延迟很久的数据利用sideOutputLateData也能被下发
- 使用cogroup 将窗口数据保存, 然后处理数据, 输出到主流
- 在主流分流, join上的数据还是放在主流, 而join不上的数据, 输出到侧输出流
- 将join失败流和延迟流进行connect, keyby分组后, 进行process处理.
- 按key分组后, 相同key的数据会进入同一个处理逻辑: 先到达的数据保存到状态并注册定时器, 后到达的数据查找对方的状态进行join; 如果定时器触发时对方数据仍未到达, 则将该数据输出并保存到指定介质.
流程图
缺点就是需要自己实现join逻辑
简单Demo 如下, flink 版本是1.14的:
package com.kimi.flink.dataStream.demo.api;
import io.netty.handler.codec.DateFormatter;
import lombok.Data;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.CoGroupFunction;
import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.*;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.functions.co.CoProcessFunction;
import org.apache.flink.streaming.api.functions.co.ProcessJoinFunction;
import org.apache.flink.streaming.api.functions.windowing.AllWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.triggers.Trigger;
import org.apache.flink.streaming.api.windowing.triggers.TriggerResult;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.streaming.api.windowing.windows.Window;
import org.apache.flink.util.Collector;
import org.apache.flink.util.StringUtils;
import org.apache.flink.util.OutputTag;
import java.io.Serializable;
import java.sql.Timestamp;
import java.time.*;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.concurrent.TimeUnit;
public class CoGroupJoinAndConnectExample {
private static final DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
/** Event-time span (ms) an unmatched record waits for its counterpart before it is given up. */
private static final long JOIN_TIMEOUT_MS = 20_000L;

/**
 * Builds and runs the demo pipeline:
 * <ol>
 *   <li>read both socket streams (ports 8888 and 9999);</li>
 *   <li>pass each through a tumbling {@code windowAll} whose only job is to divert late
 *       records into a side output instead of dropping them;</li>
 *   <li>coGroup the on-time records by id inside the same tumbling window;</li>
 *   <li>split the coGroup result: complete records stay in the main stream, records that are
 *       still missing {@code name} or {@code sex} go to the {@code joinFail} side output;</li>
 *   <li>connect the join-failed stream with the late stream, key both by id, and retry the
 *       join with keyed state plus an event-time timer; records still unmatched end up in
 *       the {@code timerFail} side output.</li>
 * </ol>
 *
 * @param args unused
 * @throws Exception if the Flink job fails
 */
public static void main(String[] args) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);

    SingleOutputStreamOperator<Student> stream1 = getSocketStream(env, "localhost", 8888);
    SingleOutputStreamOperator<Student> stream2 = getSocketStream(env, "localhost", 9999);

    TypeInformation<Student> studentType = TypeInformation.of(new TypeHint<Student>() {});
    OutputTag<Student> stream1LateTag = new OutputTag<Student>("stream1LateTag", studentType) {};
    OutputTag<Student> stream2LateTag = new OutputTag<Student>("stream2LateTag", studentType) {};
    OutputTag<Student> joinFailTag = new OutputTag<Student>("joinFail", studentType) {};
    OutputTag<Student> timerFailTag = new OutputTag<Student>("timerFail", studentType) {};

    int windowSize = 10;

    // Step 1: on-time records continue downstream, late records are captured per input.
    SingleOutputStreamOperator<Student> lateStream1WithWindow =
            divertLateRecords(stream1, windowSize, stream1LateTag);
    SingleOutputStreamOperator<Student> lateStream2WithWindow =
            divertLateRecords(stream2, windowSize, stream2LateTag);

    DataStream<Student> lateStream1 = lateStream1WithWindow.getSideOutput(stream1LateTag);
    DataStream<Student> lateStream2 = lateStream2WithWindow.getSideOutput(stream2LateTag);
    DataStream<Student> lateStream = lateStream1.union(lateStream2);
    lateStream.print("发现迟到数据: ");

    // Step 2: windowed coGroup by id. Unmatched records of either side are still emitted
    // (incomplete) so that the next step can route them to the retry path.
    DataStream<Student> joinedStream = lateStream1WithWindow
            .coGroup(lateStream2WithWindow)
            .where((KeySelector<Student, String>) Student::getId)
            .equalTo((KeySelector<Student, String>) Student::getId)
            .window(TumblingEventTimeWindows.of(Time.seconds(windowSize)))
            .apply(new CoGroupFunction<Student, Student, Student>() {
                @Override
                public void coGroup(Iterable<Student> stream1,
                                    Iterable<Student> stream2,
                                    Collector<Student> out) throws Exception {
                    boolean leftSeen = false;
                    for (Student student1 : stream1) {
                        leftSeen = true;
                        boolean joined = false;
                        for (Student student2 : stream2) {
                            student1.setSex(student2.getSex());
                            out.collect(student1);
                            joined = true;
                        }
                        if (!joined) {
                            out.collect(student1);
                        }
                    }
                    // Only when the left side is empty can right-side records be unmatched.
                    if (!leftSeen) {
                        for (Student student2 : stream2) {
                            out.collect(student2);
                        }
                    }
                }
            });

    // Step 3: complete records stay in the main stream, incomplete ones go to joinFail.
    SingleOutputStreamOperator<Student> processStream = joinedStream
            .process(new ProcessFunction<Student, Student>() {
                @Override
                public void processElement(Student student, Context ctx, Collector<Student> out)
                        throws Exception {
                    if (student.getName() != null && student.getSex() != null) {
                        out.collect(student);
                    } else {
                        ctx.output(joinFailTag, student);
                    }
                }
            })
            .returns(studentType);

    DataStream<Student> failStream = processStream.getSideOutput(joinFailTag);

    // Step 4: retry the join between join-failed and late records using keyed state + timers.
    SingleOutputStreamOperator<Student> connectStream = failStream.connect(lateStream)
            .keyBy((KeySelector<Student, String>) Student::getId,
                    (KeySelector<Student, String>) Student::getId)
            .process(new UnmatchedRecordJoiner(timerFailTag))
            .returns(studentType);

    DataStream<Student> timerFailStream = connectStream.getSideOutput(timerFailTag);

    processStream.print("主流join成功 ===> ");
    connectStream.print("定时器join成功 ===> ");
    timerFailStream.print("定时器join失败 ===> ");

    env.execute("Flink CoGroup Join + connect Example");
}

/**
 * Routes {@code input} through a tumbling event-time {@code windowAll} that forwards every
 * on-time record unchanged and sends late records to {@code lateTag}.
 *
 * @param input         stream with timestamps and watermarks already assigned
 * @param windowSeconds tumbling window size in seconds
 * @param lateTag       side-output tag that receives the late records
 * @return the on-time records; late records are available via {@code getSideOutput(lateTag)}
 */
private static SingleOutputStreamOperator<Student> divertLateRecords(
        DataStream<Student> input, int windowSeconds, OutputTag<Student> lateTag) {
    return input
            .windowAll(TumblingEventTimeWindows.of(Time.seconds(windowSeconds)))
            .sideOutputLateData(lateTag)
            .apply(new AllWindowFunction<Student, Student, TimeWindow>() {
                @Override
                public void apply(TimeWindow window, Iterable<Student> values, Collector<Student> out)
                        throws Exception {
                    for (Student student : values) {
                        out.collect(student);
                    }
                }
            });
}

/**
 * Second-chance join between join-failed records (input 1) and late records (input 2) that
 * share an id. At most one record per input is buffered per key.
 *
 * <p>An arriving record is joined only with a buffered record that supplies the field it is
 * missing; two records carrying the same field are never reported as a successful join.
 * A buffered record that is displaced by a newer one, or whose deadline
 * ({@code logTime + JOIN_TIMEOUT_MS}) passes, is emitted to the expired side output rather
 * than being dropped.
 */
private static final class UnmatchedRecordJoiner extends CoProcessFunction<Student, Student, Student> {
    private static final long serialVersionUID = 1L;

    private final OutputTag<Student> expiredTag;
    private transient ValueState<Student> failStreamState;
    private transient ValueState<Student> lateStreamState;

    UnmatchedRecordJoiner(OutputTag<Student> expiredTag) {
        this.expiredTag = expiredTag;
    }

    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        failStreamState = getRuntimeContext().getState(new ValueStateDescriptor<>("failStream", Student.class));
        lateStreamState = getRuntimeContext().getState(new ValueStateDescriptor<>("lateStream", Student.class));
    }

    @Override
    public void processElement1(Student failStreamStudent, Context ctx, Collector<Student> out) throws Exception {
        joinOrBuffer(failStreamStudent, failStreamState, lateStreamState, "failStream注册定时器: ", ctx, out);
    }

    @Override
    public void processElement2(Student lateStreamStudent, Context ctx, Collector<Student> out) throws Exception {
        joinOrBuffer(lateStreamStudent, lateStreamState, failStreamState, "lateStream注册定时器 ", ctx, out);
    }

    @Override
    public void onTimer(long timestamp, OnTimerContext ctx, Collector<Student> out) throws Exception {
        super.onTimer(timestamp, ctx, out);
        // A timer may outlive the record that registered it (that record was joined or
        // displaced), so each slot is expired only if its own deadline has been reached.
        expireIfDue(failStreamState, timestamp, ctx);
        expireIfDue(lateStreamState, timestamp, ctx);
    }

    /** Joins {@code arrival} with a complementary buffered record, or buffers it and arms its timer. */
    private void joinOrBuffer(Student arrival,
                              ValueState<Student> ownState,
                              ValueState<Student> otherState,
                              String bufferLogPrefix,
                              Context ctx,
                              Collector<Student> out) throws Exception {
        // The other input is tried first; the own slot is tried too because both inputs can
        // carry records from either source stream.
        if (joinWith(otherState, arrival, out) || joinWith(ownState, arrival, out)) {
            return;
        }
        Student displaced = ownState.value();
        if (displaced != null) {
            // Single-slot state: hand the older record over instead of overwriting it silently.
            ctx.output(expiredTag, displaced);
        }
        ownState.update(arrival);
        System.out.println(bufferLogPrefix + arrival);
        ctx.timerService().registerEventTimeTimer(deadlineOf(arrival));
    }

    /** Emits the joined record and frees the slot if it holds a complementary record. */
    private boolean joinWith(ValueState<Student> slot, Student arrival, Collector<Student> out) throws Exception {
        Student buffered = slot.value();
        if (buffered == null || !isComplementary(arrival, buffered)) {
            return false;
        }
        if (arrival.getSex() == null) {
            arrival.setSex(buffered.getSex());
        }
        if (arrival.getName() == null) {
            arrival.setName(buffered.getName());
        }
        out.collect(arrival);
        slot.clear();
        return true;
    }

    /** Emits the buffered record to the expired output once its deadline is at or before {@code firedAt}. */
    private void expireIfDue(ValueState<Student> slot, long firedAt, Context ctx) throws Exception {
        Student buffered = slot.value();
        if (buffered != null && deadlineOf(buffered) <= firedAt) {
            ctx.output(expiredTag, buffered);
            slot.clear();
        }
    }

    /** True when the two records together provide both {@code name} and {@code sex}. */
    private boolean isComplementary(Student a, Student b) {
        boolean hasName = a.getName() != null || b.getName() != null;
        boolean hasSex = a.getSex() != null || b.getSex() != null;
        return hasName && hasSex;
    }

    /** Event-time instant at which a buffered record is given up. */
    private long deadlineOf(Student student) {
        return student.getLogTime().getTime() + JOIN_TIMEOUT_MS;
    }
}
/**
 * Reads comma-separated lines of the form {@code id,value,yyyy-MM-dd HH:mm:ss} from a socket
 * and turns them into {@link Student} records with event-time timestamps and watermarks
 * (zero out-of-orderness allowed).
 *
 * <p>The meaning of the second field depends on the port: on 8888 it is stored as the name,
 * on 9999 as the sex; on any other port it is ignored. Blank lines, lines without exactly
 * three fields, and lines whose timestamp cannot be parsed are skipped.
 *
 * @param env      execution environment the source is added to
 * @param hostname socket host to connect to
 * @param port     socket port; also selects which field the second column populates
 * @return the parsed, timestamped stream
 */
private static SingleOutputStreamOperator<Student> getSocketStream(
        StreamExecutionEnvironment env,
        String hostname,
        int port) {
    return env.socketTextStream(hostname, port)
            .map(new MapFunction<String, Student>() {
                @Override
                public Student map(String value) throws Exception {
                    if (StringUtils.isNullOrWhitespaceOnly(value)) {
                        return null;
                    }
                    String[] parts = value.split(",");
                    if (parts.length != 3) {
                        return null;
                    }
                    Timestamp logTime;
                    try {
                        // Trim like the other fields so stray spaces or a trailing '\r'
                        // do not make an otherwise valid line unparseable.
                        logTime = Timestamp.valueOf(LocalDateTime.parse(parts[2].trim(), formatter));
                    } catch (DateTimeException malformedTimestamp) {
                        // One bad line must not fail the whole job; drop it like any
                        // other malformed line.
                        return null;
                    }
                    Student student = new Student();
                    student.setId(parts[0].trim());
                    if (port == 8888) {
                        student.setName(parts[1].trim());
                    } else if (port == 9999) {
                        student.setSex(parts[1].trim());
                    }
                    student.setLogTime(logTime);
                    return student;
                }
            })
            // Malformed lines were mapped to null above; remove them here.
            .filter(Objects::nonNull)
            .assignTimestampsAndWatermarks(WatermarkStrategy.<Student>forBoundedOutOfOrderness(
                            Duration.ofSeconds(0))
                    .withTimestampAssigner(new SerializableTimestampAssigner<Student>() {
                        @Override
                        public long extractTimestamp(Student element, long recordTimestamp) {
                            return element.getLogTime().getTime();
                        }
                    }));
}
/**
 * Record exchanged between all operators of the demo. Lombok's {@code @Data} generates the
 * getters, setters, {@code equals}/{@code hashCode} and {@code toString}.
 *
 * <p>The class is {@code public} with an implicit public no-arg constructor and
 * getter/setter access to every field, which is what Flink requires to treat a type as a
 * POJO instead of falling back to generic serialization.
 */
@Data
public static class Student implements Serializable {
    private static final long serialVersionUID = 1L;

    /** Join key. */
    private String id;
    /** Set for records read from port 8888; {@code null} until joined otherwise. */
    private String name;
    /** Set for records read from port 9999; {@code null} until joined otherwise. */
    private String sex;
    /** Event time of the record; used for timestamps, watermarks and timer deadlines. */
    private Timestamp logTime;
}
}