Flink Operator Summary
Creating the Execution Environment
StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();
Source Operators
// Read data from a socket source
DataStreamSource<String> streamSource = environment.socketTextStream("localhost", 7777);
// Read data from a file
FileSource<String> fileSource = FileSource.forRecordStreamFormat(new TextLineInputFormat(),
new Path("data/words.txt")).build();
environment.fromSource(fileSource, WatermarkStrategy.noWatermarks(),"file").print();
// The path may also be a directory (all files under it are read), or an HDFS path of the form hdfs://...
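As a sketch, reading an entire directory from HDFS might look like this (the namenode address and path are hypothetical):
FileSource<String> hdfsSource = FileSource.forRecordStreamFormat(new TextLineInputFormat(),
        new Path("hdfs://namenode:8020/logs/")).build();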
// Read from a collection
List<Integer> data = Arrays.asList(1, 22, 3);
DataStreamSource<Integer> ds = environment.fromCollection(data);
// Read data from Kafka
KafkaSource<String> kafkaSource = KafkaSource.<String>builder()
        .setBootstrapServers("localhost:9092")
        .setTopics("topic_1")
        .setStartingOffsets(OffsetsInitializer.latest())
        .setValueOnlyDeserializer(new SimpleStringSchema())
        .build();
DataStreamSource<String> stream = environment.fromSource(kafkaSource,
WatermarkStrategy.noWatermarks(),
"kafka-source");
Transformation Operators
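The examples below use a WaterSensor POJO that this note never defines. A minimal sketch consistent with the usages here (field types are assumptions: id as String, ts as Long, vc as Integer):
public class WaterSensor {
    public String id;  // sensor id
    public Long ts;    // event timestamp (assumed Long)
    public Integer vc; // water level value (assumed Integer)

    public WaterSensor() {} // Flink POJOs need a public no-arg constructor

    public WaterSensor(String id, Long ts, Integer vc) {
        this.id = id;
        this.ts = ts;
        this.vc = vc;
    }

    public String getId() { return id; }
    public Long getTs() { return ts; }
    public Integer getVc() { return vc; }
    public void setVc(Integer vc) { this.vc = vc; }
}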
- map
stream.map(new MapFunction<WaterSensor, String>() {
@Override
public String map(WaterSensor waterSensor) throws Exception {
return waterSensor.id;
}
}).print();
- filter
stream.filter(new FilterFunction<WaterSensor>() {
@Override
public boolean filter(WaterSensor waterSensor) throws Exception {
return waterSensor.id.equals("sensor_1");
}
}).print();
- flatMap
stream.flatMap(new FlatMapFunction<WaterSensor, String>() {
@Override
public void flatMap(WaterSensor waterSensor, Collector<String> collector) throws Exception {
if (waterSensor.id.equals("sensor_1")){
collector.collect(String.valueOf(waterSensor.vc));
} else if (waterSensor.id.equals("sensor_2")) {
collector.collect(String.valueOf(waterSensor.ts));
collector.collect(String.valueOf(waterSensor.vc));
}
}
}).print();
Aggregation
- keyBy
KeyedStream<WaterSensor, String> keyedStream = stream.keyBy(new KeySelector<WaterSensor, String>() {
@Override
public String getKey(WaterSensor waterSensor) throws Exception {
return waterSensor.id;
}
});
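Besides reduce (next), a KeyedStream has built-in simple aggregations: sum, min, max, minBy, maxBy. A minimal sketch using field-expression names on the WaterSensor POJO sketched above:
keyedStream.sum("vc").print();   // per-key running sum of vc
keyedStream.maxBy("vc").print(); // per-key record with the largest vc so far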
- reduce
environment.socketTextStream("localhost", 7777)
        .map(new WaterSensorMapFunction())
        .keyBy(WaterSensor::getId)
        .reduce(new ReduceFunction<WaterSensor>() {
            @Override
            public WaterSensor reduce(WaterSensor ws_1, WaterSensor ws_2) throws Exception {
                System.out.println("Demo7_Reduce.reduce");
                // keep whichever record carries the larger vc
                return ws_1.getVc() > ws_2.getVc() ? ws_1 : ws_2;
            }
        }).print();
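WaterSensorMapFunction (also used in the splitting example below) is never defined in this note. A minimal sketch, assuming socket lines of the form "id,ts,vc":
public static class WaterSensorMapFunction implements MapFunction<String, WaterSensor> {
    @Override
    public WaterSensor map(String line) throws Exception {
        String[] fields = line.split(",");
        return new WaterSensor(fields[0], Long.valueOf(fields[1]), Integer.valueOf(fields[2]));
    }
}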
User-Defined Functions
- Function Classes
SingleOutputStreamOperator<WaterSensor> filter = stream.filter(new UserFilter());
public static class UserFilter implements FilterFunction<WaterSensor>{
@Override
public boolean filter(WaterSensor waterSensor) throws Exception {
return waterSensor.id.equals("sensor_1");
}
}
// Equivalently, as an anonymous inner class:
SingleOutputStreamOperator<WaterSensor> filter2 = stream.filter(new FilterFunction<WaterSensor>() {
@Override
public boolean filter(WaterSensor waterSensor) throws Exception {
return waterSensor.id.equals("sensor_1");
}
});
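For simple logic, a lambda is a third option (a sketch; when a lambda's result type involves generics, Flink may need an explicit returns() hint because of Java type erasure):
SingleOutputStreamOperator<WaterSensor> filter3 = stream.filter(ws -> "sensor_1".equals(ws.getId()));
SingleOutputStreamOperator<String> ids = stream.map(WaterSensor::getId).returns(Types.STRING);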
- Rich Function Classes
environment.fromElements(1,2,3,4).map(new RichMapFunction<Integer, Integer>() {
/**
 * Rich Function lifecycle:
 * open(): the initialization hook that starts an operator instance's lifecycle;
 * it is called once before any actual work method (such as map) is invoked.
 *
 * @param parameters
 * @throws Exception
 */
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
System.out.println("索引是:"+getRuntimeContext().getIndexOfThisSubtask()+"的任务的生命周期开始");
}
@Override
public Integer map(Integer integer) throws Exception {
return integer + 1;
}
/**
 * close(): the last lifecycle method to be called; used for cleanup work.
 *
 * @throws Exception
 */
@Override
public void close() throws Exception {
super.close();
System.out.println("索引是:"+getRuntimeContext().getIndexOfThisSubtask()+"的任务的生命周期结束");
}
}).print();
Physical Partitioning
/*
 * shuffle:
 * randomly shuffles the records in the stream, distributing them
 * uniformly across the downstream task's partitions.
 */
stream.shuffle().print();
/*
 * rebalance:
 * distributes records to downstream partitions in strict rotation
 * (Round-Robin load balancing), producing an even repartitioning.
 */
stream.rebalance().print();
/*
 * broadcast:
 * replicates every record to all parallel subtasks of the downstream
 * operator, so each partition keeps a full copy of the input.
 */
stream.broadcast().print();
/*
 * global:
 * sends all records to the first (index 0) parallel subtask of the
 * downstream operator, effectively forcing it to behave as if its
 * parallelism were 1. Use with caution.
 */
stream.global().print();
- Custom partitioning
DataStream<String> stringDataStream = stream.partitionCustom(new MyPartitioner(), value -> value);
public class MyPartitioner implements Partitioner<String> {
    @Override
    public int partition(String key, int numPartitions) {
        // route each record to partition (key mod numPartitions)
        return Integer.parseInt(key) % numPartitions;
    }
}
- Splitting a stream (side outputs)
public class SplitStreamByOutputTag {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();
SingleOutputStreamOperator<WaterSensor> ds = environment.socketTextStream("localhost", 7777).map(new WaterSensorMapFunction());
OutputTag<WaterSensor> s1 = new OutputTag<WaterSensor>("s1", Types.POJO(WaterSensor.class)) {};
OutputTag<WaterSensor> s2 = new OutputTag<WaterSensor>("s2", Types.POJO(WaterSensor.class)) {};
SingleOutputStreamOperator<WaterSensor> ds_1 = ds.process(new ProcessFunction<WaterSensor, WaterSensor>() {
@Override
public void processElement(WaterSensor waterSensor, ProcessFunction<WaterSensor, WaterSensor>.Context context, Collector<WaterSensor> collector) throws Exception {
if ("s1".equals(waterSensor.getId())) {
context.output(s1, waterSensor);
} else if ("s2".equals(waterSensor.getId())) {
context.output(s2, waterSensor);
} else {
collector.collect(waterSensor);
}
}
});
ds_1.print("主流,非s1,s2的传感器");
SideOutputDataStream<WaterSensor> sideOutput_1 = ds_1.getSideOutput(s1);
SideOutputDataStream<WaterSensor> sideOutput_2 = ds_1.getSideOutput(s2);
sideOutput_1.printToErr("s1");
sideOutput_2.printToErr("s2");
environment.execute();
}
}
Merging Streams
- union
DataStreamSource<Integer> ds_1 = environment.fromElements(1, 2, 3);
DataStreamSource<Integer> ds_2 = environment.fromElements(2, 2, 3);
DataStreamSource<String> ds_3 = environment.fromElements("2", "2", "3");
// union requires identical element types, so ds_3 is first mapped to Integer
ds_1.union(ds_2, ds_3.map(Integer::valueOf)).print();
- connect
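The snippet below connects a stream of numbers with a stream of strings; the two sources are not shown in the original note, so here is a minimal sketch:
DataStreamSource<Integer> source_1 = environment.fromElements(1, 2, 3);
DataStreamSource<String> source_2 = environment.fromElements("a", "b", "c");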
/*
 * Merging with connect:
 * 1. connect joins exactly 2 streams at a time;
 * 2. the two streams may have different element types;
 * 3. after connecting you can call map, flatMap, etc., but each stream
 *    is still processed by its own function (map1/map2 below).
 */
ConnectedStreams<Integer, String> connect = source_1.connect(source_2);
SingleOutputStreamOperator<String> result = connect.map(new CoMapFunction<Integer, String, String>() {
@Override
public String map1(Integer integer) throws Exception {
return "来源于数字流:" + integer.toString();
}
@Override
public String map2(String s) throws Exception {
return "来源于字母流:" + s;
}
});
package ac.sict.reid.leo.Computing.merge;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.ConnectedStreams;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoProcessFunction;
import org.apache.flink.util.Collector;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class ConnectByKeyDemo {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();
environment.setParallelism(2);
DataStreamSource<Tuple2<Integer, String>> source_1 = environment.fromElements(
Tuple2.of(1, "a1"),
Tuple2.of(1, "a2"),
Tuple2.of(2, "b"),
Tuple2.of(3, "c")
);
DataStreamSource<Tuple3<Integer, String, Integer>> source_2 = environment.fromElements(
Tuple3.of(1, "aa1", 1),
Tuple3.of(1, "aa2", 2),
Tuple3.of(2, "bb", 1),
Tuple3.of(3, "cc", 1)
);
ConnectedStreams<Tuple2<Integer, String>, Tuple3<Integer, String, Integer>> connect = source_1.connect(source_2);
// keyBy routes records with the same id to the same subtask, so the per-key
// caches below can match records from both streams even with parallelism > 1
ConnectedStreams<Tuple2<Integer, String>, Tuple3<Integer, String, Integer>> keyedConnect = connect.keyBy(s1 -> s1.f0, s2 -> s2.f0);
SingleOutputStreamOperator<String> result = keyedConnect.process(new CoProcessFunction<Tuple2<Integer, String>, Tuple3<Integer, String, Integer>, String>() {
private Map<Integer, List<Tuple2<Integer,String>>> s1Cache = new HashMap<>();
private Map<Integer, List<Tuple3<Integer,String,Integer>>> s2Cache = new HashMap<>();
@Override
public void processElement1(Tuple2<Integer, String> value, CoProcessFunction<Tuple2<Integer, String>, Tuple3<Integer, String, Integer>, String>.Context context, Collector<String> collector) throws Exception {
Integer id = value.f0;
if (!s1Cache.containsKey(id)){
List<Tuple2<Integer,String>> s1Values = new ArrayList<>();
s1Values.add(value);
s1Cache.put(id,s1Values);
}
else {
s1Cache.get(id).add(value);
}
if(s2Cache.containsKey(id)){
for (Tuple3<Integer,String,Integer> s2Element: s2Cache.get(id)){
collector.collect("s1" + value + "<-------------------> s2:" + s2Element);
}
}
}
@Override
public void processElement2(Tuple3<Integer, String, Integer> value, CoProcessFunction<Tuple2<Integer, String>, Tuple3<Integer, String, Integer>, String>.Context context, Collector<String> collector) throws Exception {
Integer id = value.f0;
if (!s2Cache.containsKey(id)){
List<Tuple3<Integer,String,Integer>> s2Values = new ArrayList<>();
s2Values.add(value);
s2Cache.put(id,s2Values);
}
else {
s2Cache.get(id).add(value);
}
if (s1Cache.containsKey(id)){
for (Tuple2<Integer,String> s1Element:s1Cache.get(id)){
collector.collect("s1:"+s1Element + "<------------------> s2:" + value);
}
}
}
});
result.print();
environment.execute();
}
}
Sinks
- Writing to a file
package ac.sict.reid.leo.Sink;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.serialization.SimpleStringEncoder;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.connector.source.util.ratelimit.RateLimiterStrategy;
import org.apache.flink.configuration.MemorySize;
import org.apache.flink.connector.datagen.source.DataGeneratorSource;
import org.apache.flink.connector.file.sink.FileSink;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.connector.datagen.source.GeneratorFunction;
import org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig;
import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.DateTimeBucketAssigner;
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy;
import java.time.Duration;
import java.time.ZoneId;
public class SinkFile {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();
environment.setParallelism(2);
// Enable checkpointing; FileSink relies on checkpoints to finalize in-progress files
environment.enableCheckpointing(2000, CheckpointingMode.EXACTLY_ONCE);
DataGeneratorSource<String> dataGeneratorSource = new DataGeneratorSource<String>(
new GeneratorFunction<Long, String>() {
@Override
public String map(Long aLong) throws Exception {
return "Number:" + aLong;
}
}, Long.MAX_VALUE,RateLimiterStrategy.perSecond(1000),Types.STRING);
DataStreamSource<String> dataGen = environment.fromSource(dataGeneratorSource,
WatermarkStrategy.noWatermarks(), "data-generator");
/*
 * WatermarkStrategy.noWatermarks() specifies the watermark generation strategy.
 * Watermarks drive event-time processing: they let Flink handle out-of-order
 * events and lateness. noWatermarks() generates none, i.e. event time is not
 * used for this stream.
 */
FileSink<String> fileSink = FileSink.<String>forRowFormat(
        // Row-encoded output format; Flink also supports bulk encoding via forBulkFormat
        new Path("./data"), new SimpleStringEncoder<>("UTF-8")
).withOutputFileConfig(
        OutputFileConfig.builder().withPartPrefix("sict-reid-").withPartSuffix(".log").build()
).withBucketAssigner(
        // Bucket by directory: one directory per hour
        new DateTimeBucketAssigner<>("yyyy-MM-dd HH", ZoneId.systemDefault())
).withRollingPolicy(
        // Rolling policy: roll every minute, or once a part file reaches 1 MiB
        DefaultRollingPolicy.builder()
                .withRolloverInterval(Duration.ofMinutes(1))
                .withMaxPartSize(new MemorySize(1024 * 1024))
                .build()
).build();
dataGen.sinkTo(fileSink);
environment.execute();
}
}
- Writing to Kafka
package ac.sict.reid.leo.Sink;
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.connector.base.DeliveryGuarantee;
import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema;
import org.apache.flink.connector.kafka.sink.KafkaSink;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import javax.annotation.Nullable;
import java.nio.charset.StandardCharsets;
public class SinkKafkaWithKey {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();
environment.setParallelism(1);
environment.enableCheckpointing(2000, CheckpointingMode.EXACTLY_ONCE);
environment.setRestartStrategy(RestartStrategies.noRestart());
DataStreamSource<String> stream = environment.socketTextStream("localhost", 7777);
KafkaSink<String> kafkaSink = KafkaSink.<String>builder()
        .setBootstrapServers("localhost:9092")
        .setRecordSerializer(
                new KafkaRecordSerializationSchema<String>() {
                    @Nullable
                    @Override
                    public ProducerRecord<byte[], byte[]> serialize(String element, KafkaSinkContext context, Long timestamp) {
                        // use the first comma-separated field as the message key
                        String[] data = element.split(",");
                        byte[] key = data[0].getBytes(StandardCharsets.UTF_8);
                        byte[] value = element.getBytes(StandardCharsets.UTF_8);
                        return new ProducerRecord<>("ws", key, value);
                    }
                }
        )
        .setDeliveryGuarantee(DeliveryGuarantee.EXACTLY_ONCE)
        // exactly-once requires a transactional id prefix
        .setTransactionalIdPrefix("sict-reid-")
        // must exceed the checkpoint interval, yet stay within the broker's
        // transaction.max.timeout.ms (15 minutes by default)
        .setProperty(ProducerConfig.TRANSACTION_TIMEOUT_CONFIG, 10 * 60 * 1000 + "")
        .build();
stream.sinkTo(kafkaSink);
environment.execute();
}
}
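A note on the exactly-once setting above: records are written inside Kafka transactions that commit only when a checkpoint completes, so downstream consumers see them only if they read committed data. A minimal sketch of the matching consumer-side property (ConsumerConfig comes from the standard Kafka client library):
KafkaSource<String> committedReader = KafkaSource.<String>builder()
        .setBootstrapServers("localhost:9092")
        .setTopics("ws")
        .setValueOnlyDeserializer(new SimpleStringSchema())
        // read only messages from committed transactions
        .setProperty(ConsumerConfig.ISOLATION_LEVEL_CONFIG, "read_committed")
        .build();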