文章目录
Flink 合并多条流时采用 FIFO(先进先出)的方式:各条流中的元素按到达的先后顺序被依次处理。
union
多条流的元素类型必须一样
可以合并多条流:stream1.union(stream2, stream3)
connect
只能合并两条流
两条流的元素的类型可以不一样
intervalJoin
DataStream API
CoMapFunction<IN1, IN2, OUT>
map1
map2
CoFlatMapFunction<IN1, IN2, OUT>
flatMap1:来自第一条流的事件进入CoFlatMapFunction,触发调用。
flatMap2:来自第二条流的事件进入CoFlatMapFunction,触发调用。
底层API
CoProcessFunction<IN1, IN2, OUT>
processElement1
processElement2
BroadcastProcessFunction<IN1, IN2, OUT>
processElement 处理非广播流(主数据流)数据
processBroadcastElement 处理广播流数据
KeyedBroadcastProcessFunction
ProcessJoinFunction<IN1, IN2, OUT>
JoinFunction
union
/**
 * Demonstrates {@code union}: several streams with the same element type are merged
 * into a single stream and printed.
 */
public class UnionDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStreamSource<Integer> first = env.fromElements(1, 2);
        DataStreamSource<Integer> second = env.fromElements(3, 4);
        DataStreamSource<Integer> third = env.fromElements(5, 6);

        // union:
        //   1. merges any number of streams in one call
        //   2. requires every stream to carry the same element type
        //   3. elements are handled first come, first served
        first
            .union(second, third)
            .print();

        env.execute();
    }
}
connect
CoFlatMapFunction & CoMapFunction & connect
public class CoFlatMapFunctionDemo {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
DataStreamSource<Event> clickStream = env.addSource(new ClickSource());
/*
nc -lk 9092
Mary
Alice
*/
DataStreamSource<String> ruleStream = env.socketTextStream("hadoop102", 9092);
// connect
// 只能合并两条流
// 两条流的元素的类型可以不一样
ConnectedStreams<Event, String> connectedStreams = clickStream
.connect(ruleStream);
connectedStreams
.flatMap(
//CoMapFunction
new CoFlatMapFunction<Event, String, Event>() {
private String rule;
@Override
public void flatMap1(Event value, Collector<Event> out) throws Exception {
// 按规则匹配 点击流
if (StringUtils.equals(rule,value.user)) out.collect(value);
}
@Override
public void flatMap2(String value, Collector<Event> out) throws Exception {
// 处理 connect 中的 规则流
rule = value;
}
}
)
.print();
env.execute();
/*
1> Event{user='Mary', url='./cart', timestamp=2023-03-01 21:46:14.381}
1> Event{user='Mary', url='./prod?id=2', timestamp=2023-03-01 21:46:24.27}
1> Event{user='Mary', url='./prod?id=2', timestamp=2023-03-01 21:46:29.159}
1> Event{user='Mary', url='./prod?id=1', timestamp=2023-03-01 21:46:47.629}
1> Event{user='Mary', url='./prod?id=2', timestamp=2023-03-01 21:46:54.159}
1> Event{user='Mary', url='./cart', timestamp=2023-03-01 21:47:00.694}
1> Event{user='Mary', url='./prod?id=2', timestamp=2023-03-01 21:47:03.958}
2> Event{user='Alice', url='./cart', timestamp=2023-03-01 21:47:08.952}
1> Event{user='Mary', url='./prod?id=2', timestamp=2023-03-01 21:47:12.115}
2> Event{user='Alice', url='./fav', timestamp=2023-03-01 21:47:13.849}
2> Event{user='Alice', url='./home', timestamp=2023-03-01 21:47:17.104}
1> Event{user='Mary', url='./fav', timestamp=2023-03-01 21:47:18.63}
1> Event{user='Mary', url='./prod?id=1', timestamp=2023-03-01 21:47:23.528}
2> Event{user='Alice', url='./fav', timestamp=2023-03-01 21:47:28.53}
1> Event{user='Mary', url='./home', timestamp=2023-03-01 21:47:31.692}
*/
}
CoProcessFunction & connect
/**
 * Demonstrates {@code connect} with a keyed {@code CoProcessFunction}: a "switch" stream opens
 * a per-URL gate for a limited processing-time span, and click events pass only while the gate
 * for their URL is open.
 */
public class CoProcessFunctionDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        SingleOutputStreamOperator<Event> clickStream = env
            .addSource(new ClickSource());
        // Optional pre-filter, left disabled:
        // .filter(r -> "./home".equals(r.url) || "./fav".equals(r.url));

        // Switch stream of (url, open duration in ms):
        //   "./home" is let through for 5 s
        //   "./fav"  is let through for 15 s
        DataStreamSource<Tuple2<String, Long>> switchStream = env
            .fromElements(
                Tuple2.of("./home", 5 * 1000L),
                Tuple2.of("./fav", 15 * 1000L)
            );

        clickStream
            .connect(switchStream)
            // Key both inputs by URL so each URL has its own gate state and timer.
            .keyBy(cR -> cR.url, rR -> rR.f0)
            .process(
                new CoProcessFunction<Event, Tuple2<String, Long>, Event>() {
                    // Per-key gate flag: true while open, empty (null) while closed.
                    private ValueState<Boolean> enableThroughTs;

                    @Override
                    public void open(Configuration parameters) throws Exception {
                        enableThroughTs = getRuntimeContext().getState(
                            new ValueStateDescriptor<>("enableThroughTs", Types.BOOLEAN));
                    }

                    @Override
                    public void processElement1(Event value, Context ctx, Collector<Event> out) throws Exception {
                        // Click stream: read the state once; null (never opened or cleared) means closed.
                        if (Boolean.TRUE.equals(enableThroughTs.value())) {
                            out.collect(value);
                        }
                    }

                    @Override
                    public void processElement2(Tuple2<String, Long> value, Context ctx, Collector<Event> out) throws Exception {
                        // Switch stream: open the gate ...
                        enableThroughTs.update(true);
                        // ... and register a processing-time timer that closes it after value.f1 ms.
                        long closeAt = ctx.timerService().currentProcessingTime() + value.f1;
                        ctx.timerService().registerProcessingTimeTimer(closeAt);
                    }

                    @Override
                    public void onTimer(long timestamp, OnTimerContext ctx, Collector<Event> out) throws Exception {
                        // Timer fired: close the gate by clearing the flag.
                        enableThroughTs.clear();
                    }
                }
            )
            .print();

        env.execute();
    }
}
ProcessJoinFunction & intervalJoin
/**
 * Demonstrates {@code intervalJoin} with a {@code ProcessJoinFunction}: each click event is
 * joined with the same user's browse events that fall inside a time interval around the click.
 */
public class ProcessJoinFunctionDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Shared by both streams: field f2 (milliseconds) is the event timestamp,
        // with zero tolerated out-of-orderness.
        WatermarkStrategy<Tuple3<String, String, Long>> eventTimeStrategy = WatermarkStrategy
            .<Tuple3<String, String, Long>>forBoundedOutOfOrderness(Duration.ZERO)
            .withTimestampAssigner((Tuple3<String, String, Long> element, long recordTimestamp) -> element.f2);

        SingleOutputStreamOperator<Tuple3<String, String, Long>> clickStream = env
            .fromElements(
                Tuple3.of("user-1", "click", 12 * 60 * 1000L)
            )
            .assignTimestampsAndWatermarks(eventTimeStrategy);

        SingleOutputStreamOperator<Tuple3<String, String, Long>> browseStream = env
            .fromElements(
                Tuple3.of("user-1", "browse", 1 * 60 * 1000L),
                Tuple3.of("user-1", "browse", 7 * 60 * 1000L),
                Tuple3.of("user-1", "browse", 10 * 60 * 1000L),
                Tuple3.of("user-1", "browse", 11 * 60 * 1000L),
                Tuple3.of("user-1", "browse", 20 * 60 * 1000L)
            )
            .assignTimestampsAndWatermarks(eventTimeStrategy);

        clickStream
            .keyBy(r -> r.f0)
            .intervalJoin(browseStream.keyBy(r -> r.f0))
            // Join condition (bounds are inclusive by default):
            //   clickTs - 10 min <= browseTs <= clickTs + 5 min
            // For the click at minute 12 this is 2 min <= browseTs <= 17 min, so the browse
            // events at minutes 7, 10 and 11 are expected to match; minutes 1 and 20 are not.
            .between(Time.minutes(-10), Time.minutes(5))
            .process(
                new ProcessJoinFunction<Tuple3<String, String, Long>, Tuple3<String, String, Long>, String>() {
                    @Override
                    public void processElement(Tuple3<String, String, Long> left, Tuple3<String, String, Long> right, Context ctx, Collector<String> out) throws Exception {
                        // left = click event, right = matching browse event
                        out.collect(left + " -> " + right);
                    }
                }
            )
            .print();

        env.execute();
    }
}
BroadcastProcessFunction & connect
public class BroadcastProcessFunctionDemo {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 点击流
DataStreamSource<Event> clickStream = env.addSource(new ClickSource());
// 广播流
DataStreamSource<Tuple2<String, Integer>> infoStream = env
.fromElements(
Tuple2.of("Mary", 100),
Tuple2.of("Bob", 100),
Tuple2.of("Alice", 100)
);
// 配置广播流
MapStateDescriptor<String, Tuple2<String, Integer>> broadCastDescriptor = new MapStateDescriptor<String, Tuple2<String, Integer>>("broad-test", Types.STRING, Types.TUPLE(Types.STRING, Types.INT));
BroadcastStream<Tuple2<String, Integer>> broadcastStream = infoStream.broadcast(broadCastDescriptor);
clickStream
.connect(broadcastStream)
.process(
new BroadcastProcessFunction<Event, Tuple2<String, Integer>, String>() {
@Override
public void processElement(Event event, ReadOnlyContext ctx, Collector<String> out) throws Exception {
ReadOnlyBroadcastState<String, Tuple2<String, Integer>> broadcastState = ctx.getBroadcastState(new MapStateDescriptor<String, Tuple2<String, Integer>>("broad-test", Types.STRING, Types.TUPLE(Types.STRING, Types.INT)));//(broadCastDescriptor);
out.collect(event + " -> " + broadcastState.get(event.user));
}
// 处理广播流数据
@Override
public void processBroadcastElement(Tuple2<String, Integer> value, Context ctx, Collector<String> out) throws Exception {
// 设置广播流
BroadcastState<String, Tuple2<String, Integer>> broadcastState = ctx.getBroadcastState(new MapStateDescriptor<String, Tuple2<String, Integer>>("broad-test", Types.STRING, Types.TUPLE(Types.STRING, Types.INT)));//(broadCastDescriptor);
broadcastState.put(value.f0, value);
}
}
)
.print();
env.execute();
}
}
KeyedBroadcastProcessFunction & connect
public class KeyedBroadcastProcessFunctionDemo {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 点击流
DataStreamSource<Event> clickStream = env.addSource(new ClickSource());
// 广播流
DataStreamSource<Tuple2<String, Integer>> infoStream = env
.fromElements(
Tuple2.of("Mary", 100),
Tuple2.of("Bob", 100),
Tuple2.of("Alice", 100)
);
// 配置广播流
MapStateDescriptor<String, Tuple2<String, Integer>> broadCastDescriptor = new MapStateDescriptor<String, Tuple2<String, Integer>>("broad-test", Types.STRING, Types.TUPLE(Types.STRING, Types.INT));
BroadcastStream<Tuple2<String, Integer>> broadcastStream = infoStream.broadcast(broadCastDescriptor);
clickStream
.keyBy(elem -> true)
.connect(broadcastStream)
.process(new KeyedBroadcastProcessFunction<Object, Event, Tuple2<String, Integer>, String>() {
@Override
public void processElement(Event event, ReadOnlyContext ctx, Collector<String> out) throws Exception {
ReadOnlyBroadcastState<String, Tuple2<String, Integer>> broadcastState = ctx.getBroadcastState(broadCastDescriptor);
out.collect(event + " -> " + broadcastState.get(event.user));
}
@Override
public void processBroadcastElement(Tuple2<String, Integer> value, Context ctx, Collector<String> out) throws Exception {
// 设置广播流
BroadcastState<String, Tuple2<String, Integer>> broadcastState = ctx.getBroadcastState(broadCastDescriptor);
broadcastState.put(value.f0, value);
}
})
.print();
env.execute();
}
}
JoinFunction
/**
 * Demonstrates a window join with a {@code JoinFunction}: two keyed streams are joined on
 * their first field inside 5-second tumbling event-time windows.
 */
public class JoinWindowDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Field f1 holds seconds; convert to milliseconds for the event timestamp.
        WatermarkStrategy<Tuple2<String, Long>> secondsAsEventTime = WatermarkStrategy
            .<Tuple2<String, Long>>forMonotonousTimestamps()
            .withTimestampAssigner((Tuple2<String, Long> elem, long ts) -> elem.f1 * 1000L);

        SingleOutputStreamOperator<Tuple2<String, Long>> streamOne = env
            .fromElements(
                Tuple2.of("a", 1L),
                Tuple2.of("b", 1L)
            )
            .assignTimestampsAndWatermarks(secondsAsEventTime);

        SingleOutputStreamOperator<Tuple2<String, Long>> streamTwo = env
            .fromElements(
                Tuple2.of("a", 1L),
                Tuple2.of("a", 2L),
                Tuple2.of("a", 5L),
                Tuple2.of("a", 7L),
                Tuple2.of("b", 4L),
                Tuple2.of("b", 6L)
            )
            .assignTimestampsAndWatermarks(secondsAsEventTime);

        streamOne
            .join(streamTwo)
            // Join key: the first tuple field on both sides.
            .where(left -> left.f0)
            .equalTo(right -> right.f0)
            .window(TumblingEventTimeWindows.of(Time.seconds(5L)))
            .apply(new JoinFunction<Tuple2<String, Long>, Tuple2<String, Long>, String>() {
                @Override
                public String join(Tuple2<String, Long> first, Tuple2<String, Long> second) throws Exception {
                    // Called once per matching pair within the same window and key.
                    return first + " -> " + second;
                }
            })
            .print();

        env.execute();
    }
}
参考资料
https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/dev/datastream/operators/joining/