前言
Transform 数据转换算子,可以将一个或多个 DataStream 转换成新的 DataStream,可以将多个数据转换算子合并成一个复杂的数据流拓扑。
1.map
数据转换
@Test
public void mapTest() throws Exception {
    // Environment in AUTOMATIC mode: Flink picks streaming or batch execution.
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
    // Source: a fixed in-memory collection of words.
    DataStreamSource<String> words = env.fromCollection(List.of("java", "python", "flink"));
    // map: one-to-one transformation — upper-case every element.
    words
        .map(new MapFunction<String, String>() {
            @Override
            public String map(String word) throws Exception {
                return word.toUpperCase();
            }
        })
        .print();
    env.execute("flink map");
}
结果:
3> PYTHON
2> JAVA
4> FLINK
2.flatMap
数据扁平化
@Test
public void flatMapTest() throws Exception {
    // Environment in AUTOMATIC mode: Flink picks streaming or batch execution.
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
    // Source: each element is a comma-separated line.
    DataStreamSource<String> lines = env.fromCollection(List.of(
            "nacos,python,java",
            "nacos,sentinel,gateway"));
    // flatMap: one-to-many transformation — split each line and emit every token.
    lines
        .flatMap(new FlatMapFunction<String, String>() {
            @Override
            public void flatMap(String line, Collector<String> out) throws Exception {
                for (String token : line.split(",")) {
                    out.collect(token);
                }
            }
        })
        .print();
    env.execute("flink flatMap");
}
结果:
6> nacos
7> nacos
6> python
7> sentinel
7> gateway
6> java
3.filter
@Test
public void filterTest() throws Exception {
    // Environment in AUTOMATIC mode: Flink picks streaming or batch execution.
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
    // Source: the integers 1..6.
    DataStreamSource<Integer> numbers = env.fromCollection(List.of(1, 2, 3, 4, 5, 6));
    // filter: keep only elements for which the predicate is true (even numbers).
    numbers
        .filter(new FilterFunction<Integer>() {
            @Override
            public boolean filter(Integer num) throws Exception {
                boolean even = num % 2 == 0;
                return even;
            }
        })
        .print();
    env.execute("flink filter");
}
结果:
8> 4
6> 2
2> 6
4.keyBy
需要结合 max、min、reduce 等聚合算子一起使用才能体现分组效果
@Test
public void keyByTest() throws Exception {
    // Fixture: six students across three names; all share the same birthday.
    LocalDateTime birthday = LocalDateTime.now().minusYears(20);
    List<Student> roster = List.of(
            new Student("a", 100L, birthday),
            new Student("a", 90L, birthday),
            new Student("b", 100L, birthday),
            new Student("b", 80L, birthday),
            new Student("c", 90L, birthday),
            new Student("c", 70L, birthday));
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
    // Single parallelism so the printed order is stable.
    env.setParallelism(1);
    // keyBy: partition the stream by student name.
    env.fromCollection(roster)
        .keyBy(new KeySelector<Student, String>() {
            @Override
            public String getKey(Student student) throws Exception {
                return student.getName();
            }
        })
        .print();
    env.execute("flink streaming keyBy");
}
结果:
Student(name=a, score=100, birthday=2002-04-13T22:34:46.452201400)
Student(name=a, score=90, birthday=2002-04-13T22:34:46.452201400)
Student(name=b, score=100, birthday=2002-04-13T22:34:46.452201400)
Student(name=b, score=80, birthday=2002-04-13T22:34:46.452201400)
Student(name=c, score=90, birthday=2002-04-13T22:34:46.452201400)
Student(name=c, score=70, birthday=2002-04-13T22:34:46.452201400)
5.max
求最大值
@Test
public void maxTest() throws Exception {
    // Fixture: six students across three names; all share the same birthday.
    LocalDateTime birthday = LocalDateTime.now().minusYears(20);
    List<Student> roster = List.of(
            new Student("a", 100L, birthday),
            new Student("a", 90L, birthday),
            new Student("b", 100L, birthday),
            new Student("b", 80L, birthday),
            new Student("c", 90L, birthday),
            new Student("c", 70L, birthday));
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
    // Single parallelism so the printed order is stable.
    env.setParallelism(1);
    // Group by name, then track the running maximum of the "score" field per group.
    env.fromCollection(roster)
        .keyBy(new KeySelector<Student, String>() {
            @Override
            public String getKey(Student student) throws Exception {
                return student.getName();
            }
        })
        .max("score")
        .print();
    env.execute("flink streaming max");
}
结果: 分组后每组的最大值
Student(name=a, score=100, birthday=2002-04-13T22:36:11.630207700)
Student(name=b, score=100, birthday=2002-04-13T22:36:11.630207700)
Student(name=c, score=90, birthday=2002-04-13T22:36:11.630207700)
还有min、minBy、maxBy、sum等
6.reduce
归约
@Test
public void reduceTest() throws Exception {
    // Fixture: six students across three names; all share the same birthday.
    LocalDateTime birthday = LocalDateTime.now().minusYears(20);
    List<Student> roster = List.of(
            new Student("a", 100L, birthday),
            new Student("a", 90L, birthday),
            new Student("b", 100L, birthday),
            new Student("b", 80L, birthday),
            new Student("c", 90L, birthday),
            new Student("c", 70L, birthday));
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
    // Single parallelism so the printed order is stable.
    env.setParallelism(1);
    // Group by name, then fold each group by summing scores into one Student.
    env.fromCollection(roster)
        .keyBy(new KeySelector<Student, String>() {
            @Override
            public String getKey(Student student) throws Exception {
                return student.getName();
            }
        })
        .reduce((acc, next) -> {
            long total = acc.getScore() + next.getScore();
            return new Student(acc.getName(), total, acc.getBirthday());
        })
        .print();
    env.execute("flink streaming reduce");
}
结果:分数合并
Student(name=a, score=190, birthday=2002-04-13T22:42:17.541400)
Student(name=b, score=180, birthday=2002-04-13T22:42:17.542397200)
Student(name=c, score=160, birthday=2002-04-13T22:42:17.542397200)
7.connect
连接两个数据源(两个流的元素类型可以不同),对两个数据源分别单独操作,可以进行 CoMapFunction、CoFlatMapFunction、CoProcessFunction 等操作
@Test
public void connectTest() throws Exception {
    // Fixture: six students across three names; all share the same birthday.
    List<Student> students = List.of(new Student("a", 100L, LocalDateTime.now().minus(20, ChronoUnit.YEARS)),
            new Student("a", 90L, LocalDateTime.now().minus(20, ChronoUnit.YEARS)),
            new Student("b", 100L, LocalDateTime.now().minus(20, ChronoUnit.YEARS)),
            new Student("b", 80L, LocalDateTime.now().minus(20, ChronoUnit.YEARS)),
            new Student("c", 90L, LocalDateTime.now().minus(20, ChronoUnit.YEARS)),
            new Student("c", 70L, LocalDateTime.now().minus(20, ChronoUnit.YEARS)));
    // Fixed "socres" typo: scores.
    List<Integer> scores = List.of(100, 90, 80, 70, 60);
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
    env.setParallelism(1);
    DataStreamSource<Student> source1 = env.fromCollection(students);
    DataStreamSource<Integer> source2 = env.fromCollection(scores);
    // connect: the two streams may have different element types; each side is
    // mapped independently (map1 for the first stream, map2 for the second)
    // into a common output type Long.
    source1.connect(source2).map(new CoMapFunction<Student, Integer, Long>() {
        @Override
        public Long map1(Student value) throws Exception {
            return value.getScore();
        }

        @Override
        public Long map2(Integer value) throws Exception {
            return value.longValue();
        }
    })
    .print();
    // Fixed copy-pasted job name ("flink streaming reduce" -> connect).
    env.execute("flink streaming connect");
}
结果:
100
100
90
90
80
100
70
80
60
90
70
8.union
联合多个数据源,要求各数据源的元素类型相同
@Test
public void unionTest() throws Exception {
    // Fixed "socres" typos: scores1/scores2/scores3.
    List<Integer> scores1 = List.of(50, 40);
    List<Integer> scores2 = List.of(100, 90);
    List<Integer> scores3 = List.of(80, 70);
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
    env.setParallelism(1);
    DataStreamSource<Integer> source1 = env.fromCollection(scores1);
    DataStreamSource<Integer> source2 = env.fromCollection(scores2);
    DataStreamSource<Integer> source3 = env.fromCollection(scores3);
    // union: merge streams of the SAME element type into one stream,
    // then apply a single map (double every value) over the merged result.
    source1.union(source2).union(source3).map(new MapFunction<Integer, Integer>() {
        @Override
        public Integer map(Integer value) throws Exception {
            return value * 2;
        }
    })
    .print();
    // Fixed copy-pasted job name ("flink streaming reduce" -> union).
    env.execute("flink streaming union");
}
结果:
100
80
200
180
160
140
9.process
数据打标签示例
@Test
public void processTest() throws Exception {
    // Fixture: six students across three names; all share the same birthday.
    LocalDateTime birthday = LocalDateTime.now().minus(20, ChronoUnit.YEARS);
    List<Student> students = List.of(
            new Student("a", 100L, birthday),
            new Student("a", 90L, birthday),
            new Student("b", 100L, birthday),
            new Student("b", 80L, birthday),
            new Student("c", 90L, birthday),
            new Student("c", 70L, birthday));
    // Side-output tags: scores >= 90 go to the first tag, the rest to the second.
    OutputTag<Long> excellentTag = new OutputTag<>("优秀", TypeInformation.of(Long.class));
    OutputTag<Long> goodTag = new OutputTag<>("良好", TypeInformation.of(Long.class));
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
    env.setParallelism(1);
    // process: route every score to exactly one side output via ctx.output().
    SingleOutputStreamOperator<Long> process = env.fromCollection(students)
        .process(new ProcessFunction<Student, Long>() {
            @Override
            public void processElement(Student student, Context ctx, Collector<Long> out) throws Exception {
                if (student.getScore() < 90) {
                    ctx.output(goodTag, student.getScore());
                } else {
                    ctx.output(excellentTag, student.getScore());
                }
            }
        });
    // Pull each tagged side output back out as its own stream and print it.
    DataStream<Long> excellent = process.getSideOutput(excellentTag);
    DataStream<Long> good = process.getSideOutput(goodTag);
    excellent.print("优秀");
    good.print("良好");
    env.execute("flink process");
}
结果:
优秀> 100
优秀> 90
优秀> 100
良好> 80
优秀> 90
良好> 70