CoGroup算子:将两个数据流按照相同的key进行分组,在同一个窗口内把每个key在两个数据流中对应的元素集合一并交给用户函数处理,最终合成一个数据流(与join有区别:不管key有没有关联上,即某个key只出现在其中一个数据流中,该分组也会被处理,最终都会合并成一个数据流)
示例环境
java.version: 1.8.x
flink.version: 1.11.1
示例数据源 (项目码云下载)
CoGroup.java
package com.flink.examples.functions;
import com.flink.examples.DataSource;
import com.google.gson.Gson;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.CoGroupFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;
import java.time.Duration;
import java.util.Arrays;
import java.util.List;
/**
 * Demonstrates Flink's CoGroup operator on two bounded streams.
 *
 * <p>Both streams are keyed on field {@code f1} and assigned to the same tumbling
 * event-time window. For every key in a window, the elements of both streams are handed
 * to a single function call and merged into one output stream. Unlike a join, a key that
 * appears in only one of the two streams is still processed; the other side is then empty.
 */
public class CoGroup {

    /** Out-of-orderness bound used by the watermark generator of both streams. */
    private static final Duration MAX_OUT_OF_ORDERNESS = Duration.ofSeconds(2);

    /** Size of the tumbling event-time window in which the two streams are co-grouped. */
    private static final Time WINDOW_SIZE = Time.seconds(1);

    /**
     * Builds two streams from in-memory collections, co-groups them on field {@code f1}
     * inside a tumbling event-time window, and prints one merged text block per key and
     * window. Elements whose key has no counterpart in the other stream are kept.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the Flink job fails
     */
    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        // Event time is selected so that the event-time window below is driven by the
        // watermarks assigned to each stream. The periodic watermark interval is left at
        // its default; env.getConfig().setAutoWatermarkInterval(...) would change it.
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        List<Tuple3<String, String, Integer>> firstInput = DataSource.getTuple3ToList();
        List<Tuple3<String, String, Integer>> secondInput = Arrays.asList(
                new Tuple3<>("伍七", "girl", 18),
                new Tuple3<>("吴八", "man", 30)
        );

        // Both streams need timestamps and watermarks; without them the event-time window
        // would keep waiting for a watermark and apply() would not run.
        DataStream<Tuple3<String, String, Integer>> dataStream1 =
                env.fromCollection(firstInput).assignTimestampsAndWatermarks(wallClockWatermarks());
        DataStream<Tuple3<String, String, Integer>> dataStream2 =
                env.fromCollection(secondInput).assignTimestampsAndWatermarks(wallClockWatermarks());

        // Co-group the two streams on f1; groups without a match on the other side are kept.
        DataStream<String> newDataStream = dataStream1.coGroup(dataStream2)
                .where(new GroupKeySelector())
                .equalTo(new GroupKeySelector())
                .window(TumblingEventTimeWindows.of(WINDOW_SIZE))
                .apply(new JsonLinesCoGroupFunction());

        newDataStream.print();
        env.execute("flink CoGroup job");
    }

    /**
     * Creates the watermark strategy shared by both input streams.
     *
     * <p>NOTE: the sample tuples carry no event-time field, so the timestamp is the wall
     * clock at the moment of assignment. Which window an element lands in therefore
     * depends on when it is processed; elements processed on different sides of a window
     * boundary end up in different windows.
     *
     * @return a bounded-out-of-orderness strategy that stamps elements with the current time
     */
    private static WatermarkStrategy<Tuple3<String, String, Integer>> wallClockWatermarks() {
        // Typed explicitly so the two-argument assigner overload is selected unambiguously.
        SerializableTimestampAssigner<Tuple3<String, String, Integer>> wallClock =
                (element, recordTimestamp) -> System.currentTimeMillis();
        return WatermarkStrategy
                .<Tuple3<String, String, Integer>>forBoundedOutOfOrderness(MAX_OUT_OF_ORDERNESS)
                .withTimestampAssigner(wallClock);
    }

    /** Selects field {@code f1} of a tuple as the grouping key; used for both streams. */
    private static final class GroupKeySelector
            implements KeySelector<Tuple3<String, String, Integer>, String> {

        private static final long serialVersionUID = 1L;

        @Override
        public String getKey(Tuple3<String, String, Integer> value) {
            return value.f1;
        }
    }

    /**
     * Emits one string per key and window: the JSON form of every element of the first
     * stream followed by every element of the second stream, one element per line.
     */
    private static final class JsonLinesCoGroupFunction implements
            CoGroupFunction<Tuple3<String, String, Integer>, Tuple3<String, String, Integer>, String> {

        private static final long serialVersionUID = 1L;

        /** Shared serializer; static so it is created once rather than on every call. */
        private static final Gson GSON = new Gson();

        @Override
        public void coGroup(Iterable<Tuple3<String, String, Integer>> first,
                            Iterable<Tuple3<String, String, Integer>> second,
                            Collector<String> out) {
            StringBuilder merged = new StringBuilder();
            appendJsonLines(merged, first);
            appendJsonLines(merged, second);
            out.collect(merged.toString());
        }

        /** Appends each element as a JSON object followed by a newline. */
        private static void appendJsonLines(StringBuilder target,
                                            Iterable<Tuple3<String, String, Integer>> elements) {
            for (Tuple3<String, String, Integer> element : elements) {
                target.append(GSON.toJson(element)).append("\n");
            }
        }
    }
}
打印结果
{"f0":"张三","f1":"man","f2":20}
{"f0":"王五","f1":"man","f2":29}
{"f0":"吴八","f1":"man","f2":30}
{"f0":"吴八","f1":"man","f2":30}
{"f0":"李四","f1":"girl","f2":24}
{"f0":"刘六","f1":"girl","f2":32}
{"f0":"伍七","f1":"girl","f2":18}
{"f0":"伍七","f1":"girl","f2":18}