Flink Operator Summary
Creating the Execution Environment
StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();
Source Operators
// Read data from a socket source
DataStreamSource<String> streamSource = environment.socketTextStream("localhost", 7777);
// Read data from a file
FileSource<String> fileSource = FileSource.forRecordStreamFormat(new TextLineInputFormat(),
new Path("data/words.txt")).build();
environment.fromSource(fileSource, WatermarkStrategy.noWatermarks(),"file").print();
// The path may also be a directory (all files under it are read), or an HDFS path of the form hdfs://...
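As a sketch, reading an entire directory from HDFS might look like this (the namenode address and path are hypothetical):
FileSource<String> hdfsSource = FileSource.forRecordStreamFormat(new TextLineInputFormat(),
        new Path("hdfs://namenode:8020/logs/")).build();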
// Read from a collection
List<Integer> data = Arrays.asList(1, 22, 3);
DataStreamSource<Integer> ds = environment.fromCollection(data);
// Read data from Kafka
KafkaSource<String> kafkaSource = KafkaSource.<String>builder()
        .setBootstrapServers("localhost:9092")
        .setTopics("topic_1")
        .setStartingOffsets(OffsetsInitializer.latest())
        .setValueOnlyDeserializer(new SimpleStringSchema())
        .build();
DataStreamSource<String> stream = environment.fromSource(kafkaSource,
WatermarkStrategy.noWatermarks(),
"kafka-source");
Transformation Operators
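The examples below use a WaterSensor POJO that this note never defines. A minimal sketch consistent with the usages here (field types are assumptions: id as String, ts as Long, vc as Integer):
public class WaterSensor {
    public String id;  // sensor id
    public Long ts;    // event timestamp (assumed Long)
    public Integer vc; // water level value (assumed Integer)

    public WaterSensor() {} // Flink POJOs need a public no-arg constructor

    public WaterSensor(String id, Long ts, Integer vc) {
        this.id = id;
        this.ts = ts;
        this.vc = vc;
    }

    public String getId() { return id; }
    public Long getTs() { return ts; }
    public Integer getVc() { return vc; }
    public void setVc(Integer vc) { this.vc = vc; }
}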
- map
stream.map(new MapFunction<WaterSensor, String>() {
@Override
public String map(WaterSensor waterSensor) throws Exception {
return waterSensor.id;
}
}).print();
- filter
stream.filter(new FilterFunction<WaterSensor>() {
@Override
public boolean filter(WaterSensor waterSensor) throws Exception {
return waterSensor.id.equals("sensor_1");
}
}).print();
- flatMap
stream.flatMap(new FlatMapFunction<WaterSensor, String>() {
@Override
public void flatMap(WaterSensor waterSensor, Collector<String> collector) throws Exception {
if (waterSensor.id.equals("sensor_1")){
collector.collect(String.valueOf(waterSensor.vc));
} else if (waterSensor.id.equals("sensor_2")) {
collector.collect(String.valueOf(waterSensor.ts));
collector.collect(String.valueOf(waterSensor.vc));
}
}
}).print();
Aggregation
- keyBy
KeyedStream<WaterSensor, String> keyedStream = stream.keyBy(new KeySelector<WaterSensor, String>() {
@Override
public String getKey(WaterSensor waterSensor) throws Exception {
return waterSensor.id;
}
});
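Besides reduce (next), a KeyedStream has built-in simple aggregations: sum, min, max, minBy, maxBy. A minimal sketch using field-expression names on the WaterSensor POJO sketched above:
keyedStream.sum("vc").print();   // per-key running sum of vc
keyedStream.maxBy("vc").print(); // per-key record with the largest vc so far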
- reduce
environment.socketTextStream("localhost", 7777)
        .map(new WaterSensorMapFunction())
        .keyBy(WaterSensor::getId)
        .reduce(new ReduceFunction<WaterSensor>() {
            @Override
            public WaterSensor reduce(WaterSensor ws_1, WaterSensor ws_2) throws Exception {
                System.out.println("Demo7_Reduce.reduce");
                // keep whichever record carries the larger vc
                return ws_1.getVc() > ws_2.getVc() ? ws_1 : ws_2;
            }
        }).print();
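WaterSensorMapFunction (also used in the splitting example below) is never defined in this note. A minimal sketch, assuming socket lines of the form "id,ts,vc":
public static class WaterSensorMapFunction implements MapFunction<String, WaterSensor> {
    @Override
    public WaterSensor map(String line) throws Exception {
        String[] fields = line.split(",");
        return new WaterSensor(fields[0], Long.valueOf(fields[1]), Integer.valueOf(fields[2]));
    }
}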
User-Defined Functions
- Function Classes
SingleOutputStreamOperator<WaterSensor> filter = stream.filter(new UserFilter());
public static class UserFilter implements FilterFunction<WaterSensor>{
@Override
public boolean filter(WaterSensor waterSensor) throws Exception {
return waterSensor.id.equals("sensor_1");
}
}
// Equivalently, as an anonymous inner class:
SingleOutputStreamOperator<WaterSensor> filter2 = stream.filter(new FilterFunction<WaterSensor>() {
@Override
public boolean filter(WaterSensor waterSensor) throws Exception {
return waterSensor.id.equals("sensor_1");
}
});
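For simple logic, a lambda is a third option (a sketch; when a lambda's result type involves generics, Flink may need an explicit returns() hint because of Java type erasure):
SingleOutputStreamOperator<WaterSensor> filter3 = stream.filter(ws -> "sensor_1".equals(ws.getId()));
SingleOutputStreamOperator<String> ids = stream.map(WaterSensor::getId).returns(Types.STRING);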
- Rich Function Classes
environment.fromElements(1,2,3,4).map(new RichMapFunction<Integer, Integer>() {
/**
 * Rich Function lifecycle:
 * open(): the initialization hook that starts an operator instance's lifecycle;
 * it is called once before any actual work method (such as map) is invoked.
 *
 * @param parameters
 * @throws Exception
 */
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
System.out.println("索引是:"+getRuntimeContext().getIndexOfThisSubtask()+"的任务的生命周期开始");
}
@Override
public Integer map(Integer integer) throws Exception {
return integer + 1;
}
/**
 * close(): the last lifecycle method to be called; used for cleanup work.
 *
 * @throws Exception
 */
@Override
public void close() throws Exception {
super.close();
System.out.println("索引是:"+getRuntimeContext().getIndexOfThisSubtask()+"的任务的生命周期结束");
}
}).print();
Physical Partitioning
/*
 * shuffle:
 * randomly shuffles the records in the stream, distributing them
 * uniformly across the downstream task's partitions.
 */
stream.shuffle().print();
/*
 * rebalance:
 * distributes records to downstream partitions in strict rotation
 * (Round-Robin load balancing), producing an even repartitioning.
 */
stream.rebalance().print();
/*
 * broadcast:
 * replicates every record to all parallel subtasks of the downstream
 * operator, so each partition keeps a full copy of the input.
 */
stream.broadcast().print();
/*
 * global:
 * sends all records to the first (index 0) parallel subtask of the
 * downstream operator, effectively forcing it to behave as if its
 * parallelism were 1. Use with caution.
 */
stream.global().print();
- Custom partitioning
DataStream<String> stringDataStream = stream.partitionCustom(new MyPartitioner(), value -> value);
public class MyPartitioner implements Partitioner<String> {
    @Override
    public int partition(String key, int numPartitions) {
        // route each record to partition (key mod numPartitions)
        return Integer.parseInt(key) % numPartitions;
    }
}
- Splitting a stream (side outputs)
public class SplitStreamByOutputTag {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();
SingleOutputStreamOperator<WaterSensor> ds = environment.socketTextStream("localhost", 7777).map(new WaterSensorMapFunction());
OutputTag<WaterSensor> s1 = new OutputTag<WaterSensor>("s1", Types.POJO(WaterSensor.class)) {};
OutputTag<WaterSensor> s2 = new OutputTag<WaterSensor>("s2", Types.POJO(WaterSensor.class)) {};
SingleOutputStreamOperator<WaterSensor> ds_1 = ds.process(new ProcessFunction<WaterSensor, WaterSensor>() {
@Override
public void processElement(WaterSensor waterSensor, ProcessFunction<WaterSensor, WaterSensor>.Context context, Collector<WaterSensor> collector) throws Exception {
if ("s1".equals(waterSensor.getId())) {
context.output(s1, waterSensor);
} else if ("s2".equals(waterSensor.getId())) {
context.output(s2, waterSensor);
} else {
collector.collect(waterSensor);
}
}
});
ds_1.print("主流,非s1,s2的传感器");
SideOutputDataStream<WaterSensor> sideOutput_1 = ds_1.getSideOutput(s1);
SideOutputDataStream<WaterSensor> sideOutput_2 = ds_1.getSideOutput(s2);
sideOutput_1.printToErr("s1");
sideOutput_2.printToErr("s2");
environment.execute();
}
}
Merging Streams
- union
DataStreamSource<Integer> ds_1 = environment.fromElements(1, 2, 3);
DataStreamSource<Integer> ds_2 = environment.fromElements(2, 2, 3);
DataStreamSource<String> ds_3 = environment.fromElements("2", "2", "3");
// union requires identical element types, so ds_3 is first mapped to Integer
ds_1.union(ds_2, ds_3.map(Integer::valueOf)).print();
- connect
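The snippet below connects a stream of numbers with a stream of strings; the two sources are not shown in the original note, so here is a minimal sketch:
DataStreamSource<Integer> source_1 = environment.fromElements(1, 2, 3);
DataStreamSource<String> source_2 = environment.fromElements("a", "b", "c");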
/*
 * Merging with connect:
 * 1. connect joins exactly 2 streams at a time;
 * 2. the two streams may have different element types;
 * 3. after connecting you can call map, flatMap, etc., but each stream
 *    is still processed by its own function (map1/map2 below).
 */
ConnectedStreams<Integer, String> connect = source_1.connect(source_2);
SingleOutputStreamOperator<String> result = connect.map(new CoMapFunction<Integer, String, String>() {
@Override
public String map1(Integer integer) throws Exception {
return "来源于数字流:" + integer.toString();
}
@Override
public String map2(String s) throws Exception {
return "来源于字母流:" + s;
}
});
package ac.sict.reid.leo.Computing.merge;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.ConnectedStreams;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoProcessFunction;
import org.apache.flink.util.Collector;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class ConnectByKeyDemo {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();
environment.setParallelism(2);
DataStreamSource<Tuple2<Integer, String>> source_1 = environment.fromElements(
Tuple2.of(1, "a1"),
Tuple2.of(1, "a2"),
Tuple2.of(2, "b"),
Tuple2.of(3, "c")
);
DataStreamSource<Tuple3<Integer, String, Integer>> source_2 = environment.fromElements(
Tuple3.of(1, "aa1", 1),
Tuple3.of(1, "aa2", 2),
Tuple3.of(2, "bb", 1),
Tuple3.of(3, "cc", 1)
);
ConnectedStreams<Tuple2<Integer, String>, Tuple3<Integer, String, Integer>> connect = source_1.connect(source_2);
// keyBy routes records with the same id to the same subtask, so the per-key
// caches below can match records from both streams even with parallelism > 1
ConnectedStreams<Tuple2<Integer, String>, Tuple3<Integer, String, Integer>> keyedConnect = connect.keyBy(s1 -> s1.f0, s2 -> s2.f0);
SingleOutputStreamOperator<String> result = keyedConnect.process(new CoProcessFunction<Tuple2<Integer, String>, Tuple3<Integer, String, Integer>, String>() {
private Map<Integer, List<Tuple2<Integer,String>>> s1Cache = new HashMap<>();
private Map<Integer, List<Tuple3<Integer,String,Integer>>> s2Cache = new HashMap<>();
@Override
public void processElement1(Tuple2<Integer, String> value, CoProcessFunction<Tuple2<Integer, String>, Tuple3<Integer, String, Integer>, String>.Context context, Collector<String> collector) throws Exception {
Integer id = value.f0;
if (!s1Cache.containsKey(id)){
List<Tuple2<Integer,String>> s1Values = new ArrayList<>();
s1Values.add(value);
s1Cache.put(id,s1Values);
}
else {
s1Cache.get(id).add(value);
}
if(s2Cache.containsKey(id)){
for (Tuple3<Integer,String,Integer> s2Element: s2Cache.get(id)){
collector.collect("s1" + value + "<-------------------> s2:" + s2Element);
}
}
}
@Override
public void processElement2(Tuple3<Integer, String, Integer> value, CoProcessFunction<Tuple2<Integer, String>, Tuple3<Integer, String, Integer>, String>.Context context, Collector<String> collector) throws Exception {
Integer id = value.f0;
if (!s2Cache.containsKey(id)){
List<Tuple3<Integer,String,Integer>> s2Values = new ArrayList<>();
s2Values.add(value);
s2Cache.put(id,s2Values);
}
else {
s2Cache.get(id).add(value);
}
if (s1Cache.containsKey(id)){
for (Tuple2<Integer,String> s1Element:s1Cache.get(id)){
collector.collect("s1:"+s1Element + "<------------------> s2:" + value);
}
}
}
});
result.print();
environment.execute();
}
}
Sinks
- Writing to a file
package ac.sict.reid.leo.Sink;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.serialization.SimpleStringEncoder;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.connector.source.util.ratelimit.RateLimiterStrategy;
import org.apache.flink.configuration.MemorySize;
import org.apache.flink.connector.datagen.source.DataGeneratorSource;
import org.apache.flink.connector.file.sink.FileSink;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.connector.datagen.source.GeneratorFunction;
import org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig;
import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.DateTimeBucketAssigner;
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy;
import java.time.Duration;
import java.time.ZoneId;
public class SinkFile {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();
environment.setParallelism(2);
// Enable checkpointing; FileSink relies on checkpoints to finalize in-progress files
environment.enableCheckpointing(2000, CheckpointingMode.EXACTLY_ONCE);
DataGeneratorSource<String> dataGeneratorSource = new DataGeneratorSource<String>(
new GeneratorFunction<Long, String>() {
@Override
public String map(Long aLong) throws Exception {
return "Number:" + aLong;
}
}, Long.MAX_VALUE,RateLimiterStrategy.perSecond(1000),Types.STRING);
DataStreamSource<String> dataGen = environment.fromSource(dataGeneratorSource,
WatermarkStrategy.noWatermarks(), "data-generator");
/*
 * WatermarkStrategy.noWatermarks() specifies the watermark generation strategy.
 * Watermarks drive event-time processing: they let Flink handle out-of-order
 * events and lateness. noWatermarks() generates none, i.e. event time is not
 * used for this stream.
 */
FileSink<String> fileSink = FileSink.<String>forRowFormat(
        // Row-encoded output format; Flink also supports bulk encoding via forBulkFormat
        new Path("./data"), new SimpleStringEncoder<>("UTF-8")
).withOutputFileConfig(
        OutputFileConfig.builder().withPartPrefix("sict-reid-").withPartSuffix(".log").build()
).withBucketAssigner(
        // Bucket by directory: one directory per hour
        new DateTimeBucketAssigner<>("yyyy-MM-dd HH", ZoneId.systemDefault())
).withRollingPolicy(
        // Rolling policy: roll every minute, or once a part file reaches 1 MiB
        DefaultRollingPolicy.builder()
                .withRolloverInterval(Duration.ofMinutes(1))
                .withMaxPartSize(new MemorySize(1024 * 1024))
                .build()
).build();
dataGen.sinkTo(fileSink);
environment.execute();
}
}
- Writing to Kafka
package ac.sict.reid.leo.Sink;
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.connector.base.DeliveryGuarantee;
import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema;
import org.apache.flink.connector.kafka.sink.KafkaSink;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import javax.annotation.Nullable;
import java.nio.charset.StandardCharsets;
public class SinkKafkaWithKey {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();
environment.setParallelism(1);
environment.enableCheckpointing(2000, CheckpointingMode.EXACTLY_ONCE);
environment.setRestartStrategy(RestartStrategies.noRestart());
DataStreamSource<String> stream = environment.socketTextStream("localhost", 7777);
KafkaSink<String> kafkaSink = KafkaSink.<String>builder()
        .setBootstrapServers("localhost:9092")
        .setRecordSerializer(
                new KafkaRecordSerializationSchema<String>() {
                    @Nullable
                    @Override
                    public ProducerRecord<byte[], byte[]> serialize(String element, KafkaSinkContext context, Long timestamp) {
                        // use the first comma-separated field as the message key
                        String[] data = element.split(",");
                        byte[] key = data[0].getBytes(StandardCharsets.UTF_8);
                        byte[] value = element.getBytes(StandardCharsets.UTF_8);
                        return new ProducerRecord<>("ws", key, value);
                    }
                }
        )
        .setDeliveryGuarantee(DeliveryGuarantee.EXACTLY_ONCE)
        // exactly-once requires a transactional id prefix
        .setTransactionalIdPrefix("sict-reid-")
        // must exceed the checkpoint interval, yet stay within the broker's
        // transaction.max.timeout.ms (15 minutes by default)
        .setProperty(ProducerConfig.TRANSACTION_TIMEOUT_CONFIG, 10 * 60 * 1000 + "")
        .build();
stream.sinkTo(kafkaSink);
environment.execute();
}
}
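A note on the exactly-once setting above: records are written inside Kafka transactions that commit only when a checkpoint completes, so downstream consumers see them only if they read committed data. A minimal sketch of the matching consumer-side property (ConsumerConfig comes from the standard Kafka client library):
KafkaSource<String> committedReader = KafkaSource.<String>builder()
        .setBootstrapServers("localhost:9092")
        .setTopics("ws")
        .setValueOnlyDeserializer(new SimpleStringSchema())
        // read only messages from committed transactions
        .setProperty(ConsumerConfig.ISOLATION_LEVEL_CONFIG, "read_committed")
        .build();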