0 项目依赖
<!-- Declares a java.version property of 1.8. Nothing in this fragment references it;
     the compiler plugin below spells out 1.8 directly. -->
<properties>
<java.version>1.8</java.version>
</properties>
<!-- Flink 1.12.0 artifacts; the _2.12 suffix on some artifact ids is part of the artifact name. -->
<dependencies>
<!-- Flink client library. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_2.12</artifactId>
<version>1.12.0</version>
</dependency>
<!-- Flink Java API (the examples import Tuple2 and KeySelector from org.apache.flink.api.java). -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>1.12.0</version>
</dependency>
<!-- DataStream API used by every example class (org.apache.flink.streaming.api.*). -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.12</artifactId>
<version>1.12.0</version>
</dependency>
<!-- JDBC connector; not referenced by the example classes shown in this section. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-jdbc_2.12</artifactId>
<version>1.12.0</version>
</dependency>
<!-- MySQL JDBC driver; presumably paired with the JDBC connector above (not used in this section). -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.47</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Compile sources and emit bytecode at Java 1.8 level. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.5.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
1 基本操作
- flatMap
- filter
- map
- keyBy
- sum
- reduce
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
/**
 * Word-count walkthrough of the basic DataStream transformations:
 * flatMap, filter, map, keyBy and reduce.
 */
public class TransformationExamples {

    /**
     * Splits comma-separated lines into words, drops the word "fu", pairs each
     * remaining word with a count of 1, groups by word and adds up the counts.
     * The aggregated stream is printed.
     *
     * @param args unused
     * @throws Exception if the job fails to execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);

        DataStream<String> lines = env.fromElements(
                "hadoop,hdfs,flink,spark",
                "hadoop,hdfs,flink",
                "hadoop,hdfs",
                "fu,you",
                "you,fu",
                "hadoop,spark");

        // flatMap: one input line fans out into one record per comma-separated word.
        DataStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public void flatMap(String line, Collector<String> out) throws Exception {
                for (String token : line.split(",")) {
                    out.collect(token);
                }
            }
        });

        // filter: discard every "fu", logging each occurrence to stdout.
        DataStream<String> keptWords = words.filter(word -> {
            boolean isFu = word.equals("fu");
            if (isFu) {
                System.out.println("检测到“fu”!");
            }
            return !isFu;
        });

        // map: turn each word into a (word, 1) tuple.
        DataStream<Tuple2<String, Integer>> wordOnes =
                keptWords.map(new MapFunction<String, Tuple2<String, Integer>>() {
                    @Override
                    public Tuple2<String, Integer> map(String word) throws Exception {
                        return Tuple2.of(word, 1);
                    }
                });

        // keyBy: group the tuples by the word itself.
        KeyedStream<Tuple2<String, Integer>, String> byWord = wordOnes.keyBy(pair -> pair.f0);

        // reduce: add up the counts per word. Calling sum(1) on the keyed stream
        // is the shorter alternative for aggregating tuple field 1.
        DataStream<Tuple2<String, Integer>> counts =
                byWord.reduce((left, right) -> Tuple2.of(left.f0, left.f1 + right.f1));

        counts.print();
        env.execute();
    }
}
2 合并
- union : 合并
- connect : 连接
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.ConnectedStreams;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoMapFunction;
import org.apache.flink.util.Collector;
/**
 * Demonstrates the two ways of combining streams: {@code union} for streams of
 * the same element type and {@code connect} for streams of different types.
 */
public class TransformationExamples2 {

    /**
     * Splits two String streams into words, unions them and tags each element,
     * then connects the first word stream with a Long stream and maps both
     * sides to tagged Strings. Both resulting streams are printed.
     *
     * @param args unused
     * @throws Exception if the job fails to execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);

        DataStream<String> bigDataLines = env.fromElements("hadoop,spark,hdfs","hdfs,spark","spark");
        DataStream<String> languageLines = env.fromElements("java,python,c++","java,python","java");
        DataStream<Long> numbers = env.fromElements(100L,110L,120L);

        // Shared splitter: emits every comma-separated token of a line.
        FlatMapFunction<String, String> splitOnComma = new FlatMapFunction<String, String>() {
            @Override
            public void flatMap(String line, Collector<String> out) throws Exception {
                for (String token : line.split(",")) {
                    out.collect(token);
                }
            }
        };
        DataStream<String> bigDataWords = bigDataLines.flatMap(splitOnComma);
        DataStream<String> languageWords = languageLines.flatMap(splitOnComma);

        // union: both inputs must carry the same element type.
        DataStream<String> unioned = bigDataWords.union(languageWords);
        DataStream<String> taggedUnion = unioned.map(new MapFunction<String, String>() {
            @Override
            public String map(String word) throws Exception {
                return "d1_d2: " + word;
            }
        });

        // connect: the two inputs may have different element types ...
        ConnectedStreams<String, Long> connected = bigDataWords.connect(numbers);
        // ... and the connected pair is turned back into a single stream by a co-function,
        // here with one mapping per input side.
        DataStream<String> taggedConnect = connected.map(new CoMapFunction<String, Long, String>() {
            @Override
            public String map1(String word) throws Exception {
                return "d1_d3: String: " + word;
            }

            @Override
            public String map2(Long number) throws Exception {
                return "d1_d3: Long: " + number;
            }
        });

        taggedUnion.print();
        taggedConnect.print();
        env.execute();
    }
}
输出:
3 拆分
- split : 分流(已废弃,并在 Flink 1.12 中被移除,请改用 side outputs)
- select : 获取分流后的数据(与 split 配套使用,同样已被移除)
- side outputs : 使用 process 方法对流中的数据进行处理,使用 outputTag 来记录处理结果
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;
/**
 * Demonstrates side outputs: a {@link ProcessFunction} routes selected
 * elements to an {@link OutputTag}, and the tagged elements are read back
 * with {@code getSideOutput}.
 */
public class TransformationExamples3 {

    /**
     * Sends every odd number of the input stream to the "odd" side output and
     * prints that side output. Even numbers are not emitted anywhere: the
     * process function never writes to its main-output collector.
     *
     * @param args unused
     * @throws Exception if the job fails to execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
        DataStream<Integer> ds = env.fromElements(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);

        // Tag identifying the side output that carries the odd numbers.
        OutputTag<Integer> odd = new OutputTag<>("odd", TypeInformation.of(Integer.class));

        // Route odd numbers to the side output.
        SingleOutputStreamOperator<Integer> process = ds.process(new ProcessFunction<Integer, Integer>() {
            @Override
            public void processElement(Integer integer, Context context, Collector<Integer> collector) throws Exception {
                // Compare against 0 rather than 1: Java's % keeps the sign of the
                // dividend, so a negative odd value yields -1 and "== 1" would miss it.
                if (integer % 2 != 0) {
                    context.output(odd, integer);
                }
            }
        });

        // Read the tagged elements back from the operator.
        DataStream<Integer> oddNums = process.getSideOutput(odd);
        oddNums.print();
        env.execute();
    }
}
4 分区
4.1 rebalance 重平衡分区
防止一个分区处理过多数据从而延长运行时间
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/**
 * Compares per-subtask element counts with and without {@code rebalance()}.
 */
public class TransformationExamples4 {

    /**
     * Filters the sequence 0..100 down to values greater than 10, then counts
     * how many elements each map subtask handles, once directly after the
     * filter and once after a {@code rebalance()}. Both count streams are
     * printed as (subtask index, count) tuples.
     *
     * @param args unused
     * @throws Exception if the job fails to execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);

        DataStream<Long> sequence = env.fromSequence(0, 100);

        // The tutorial notes that after this filter the remaining elements may be
        // spread unevenly over the subtasks (data skew).
        DataStream<Long> aboveTen = sequence.filter(new FilterFunction<Long>() {
            @Override
            public boolean filter(Long value) throws Exception {
                return value > 10;
            }
        });

        // Replaces each element with (index of the subtask that processed it, 1).
        // A rich function is needed to reach the runtime context.
        RichMapFunction<Long, Tuple2<Integer, Integer>> tagWithSubtask =
                new RichMapFunction<Long, Tuple2<Integer, Integer>>() {
                    @Override
                    public Tuple2<Integer, Integer> map(Long value) throws Exception {
                        return Tuple2.of(getRuntimeContext().getIndexOfThisSubtask(), 1);
                    }
                };

        // Keys a (subtask index, 1) tuple by its subtask index.
        KeySelector<Tuple2<Integer, Integer>, Integer> bySubtask =
                new KeySelector<Tuple2<Integer, Integer>, Integer>() {
                    @Override
                    public Integer getKey(Tuple2<Integer, Integer> tagged) throws Exception {
                        return tagged.f0;
                    }
                };

        // Without rebalancing: counts may be skewed.
        SingleOutputStreamOperator<Tuple2<Integer, Integer>> taggedDirect = aboveTen.map(tagWithSubtask);
        SingleOutputStreamOperator<Tuple2<Integer, Integer>> result1 =
                taggedDirect.keyBy(bySubtask).sum(1);

        // With rebalancing in front of the map: elements are redistributed before counting.
        DataStream<Long> rebalanced = aboveTen.rebalance();
        SingleOutputStreamOperator<Tuple2<Integer, Integer>> taggedRebalanced = rebalanced.map(tagWithSubtask);
        SingleOutputStreamOperator<Tuple2<Integer, Integer>> result2 =
                taggedRebalanced.keyBy(bySubtask).sum(1);

        result1.print("result1");
        result2.print("result2");
        env.execute();
    }
}
输出如下,格式为(分区号,处理的数据数量),其中 result1 未平衡,result2 平衡:
- 未进行平衡分区的 result1 中,2号分区处理的数据较少,发生了数据倾斜
- 进行了重平衡分区的 result2 中,各个分区的处理的数据量大致相同,没有很大波动。
4.2 其他分区
分区类型 | 说明 |
---|---|
dataStream.global() | 全部发往第一个task |
dataStream.broadcast() | 广播 |
dataStream.forward() | 上下游并发度相同时一对一发送 |
dataStream.shuffle() | 随机均匀分配 |
dataStream.rebalance() | Round-Robin(重平衡) |
dataStream.partitionCustom() | 自定义 |
package Transformation;
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.Partitioner;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
public class TransformationExamples5 {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC).setParallelism(3);
DataStream<Long> longDS = env.fromSequence(0, 100);
// 随机分配一下,这个过程有可能出现数据倾斜
DataStream<Long> filterDS = longDS.filter(new FilterFunction<Long>() {
@Override
public boolean filter(Long num) throws Exception {
return num > 10;
}
});
// 定义 mapFunction 和 keySelector ,也可以用lambda写
RichMapFunction<Long,Tuple2<Integer, Integer>> mapFunction = new RichMapFunction<Long, Tuple2<Integer, Integer>>() {
@Override
public Tuple2<Integer, Integer> map(Long aLong) throws Exception {
int id = getRuntimeContext().getIndexOfThisSubtask();
return Tuple2.of(id, 1);
}
};
KeySelector<Tuple2<Integer, Integer>, Integer> keySelector = new KeySelector<Tuple2<Integer, Integer>, Integer>() {
@Override
public Integer getKey(Tuple2<Integer, Integer> integerIntegerTuple2) throws Exception {
return integerIntegerTuple2.f0;
}
};
DataStream<Tuple2<Integer, Integer>> result1 =
filterDS.map(mapFunction)
.keyBy(keySelector)
.sum(1);
DataStream<Tuple2<Integer, Integer>> result2 =
filterDS.global()
.map(mapFunction)
.keyBy(keySelector)
.sum(1);
DataStream<Tuple2<Integer, Integer>> result3 =
filterDS.broadcast()
.map(mapFunction)
.keyBy(keySelector)
.sum(1);
DataStream<Tuple2<Integer, Integer>> result4 =
filterDS.forward()
.map(mapFunction)
.keyBy(keySelector)
.sum(1);
DataStream<Tuple2<Integer, Integer>> result5 =
filterDS.shuffle()
.map(mapFunction)
.keyBy(keySelector)
.sum(1);
DataStream<Tuple2<Integer, Integer>> result6 =
filterDS.rebalance()
.map(mapFunction)
.keyBy(keySelector)
.sum(1);
DataStream<Tuple2<Integer, Integer>> result7 =
filterDS.partitionCustom(new Partitioner<Long>() {
@Override
public int partition(Long aLong, int i) {
return aLong > 50 ? 0 : 1;
}
}, new KeySelector<Long, Long>() {
@Override
public Long getKey(Long aLong) throws Exception {
return aLong;
}
})
.map(mapFunction)
.keyBy(keySelector)
.sum(1);
result1.print("result1:默认");
result2.print("result2:全部发到第一个task");
result3.print("result3:广播");
result4.print("result4:上下游并发度相同时一对一发送");
result5.print("result5:随机均匀分配");
result6.print("result6:Round-Robin");
result7.print("result7:自定义分配");
env.execute();
}
}
输出如下: