flink java api (Part 1)


1 data-source

//local file source
DataStreamSource<String> textFile = env.readTextFile("D:\\work\\projects\\flink-demo\\src\\main\\resources\\adclick.txt");

//socket source
DataStreamSource<String> textStream = env.socketTextStream("fk-1", 9099);
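For reference, a minimal complete job wrapping the socket source above (a sketch, not from the original post; host and port reuse the placeholder fk-1:9099):

public class SocketSourceTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        //read lines from the socket and print them to the console
        DataStreamSource<String> textStream = env.socketTextStream("fk-1", 9099);
        textStream.print();

        env.execute("socket source");
    }
}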

1.1 kafka-source


public class KafkaSourceTest {
    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        //checkpoint trigger interval in milliseconds
        env.enableCheckpointing(5000);
        // the default CheckpointingMode is EXACTLY_ONCE; AT_LEAST_ONCE can also be specified
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);

        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", "fk-1:9092");

        //set the consumer group
        properties.setProperty("group.id", "flink-group");

        //auto.offset.reset has three modes; the default is latest
        //earliest: if a partition has a committed offset, consume from that offset; otherwise consume from the beginning
        //latest: if a partition has a committed offset, consume from that offset; otherwise consume only newly produced data in that partition
        //none: if every partition of the topic has a committed offset, consume from those offsets; if any partition has no committed offset, throw an exception

        properties.setProperty("auto.offset.reset", "earliest");

//        FlinkKafkaConsumer kafkaConsumer = new FlinkKafkaConsumer<>("test", new SimpleStringSchema(), properties);
//        DataStreamSource stream = env.addSource(kafkaConsumer).setParallelism(2);


        //consume multiple topics
        List<String> topics = new ArrayList<>();
        topics.add("test");
        topics.add("words");
        DataStream<String> streamBatch = env
                .addSource(new FlinkKafkaConsumer<>(topics, new SimpleStringSchema(), properties));

        streamBatch.print();
        env.execute("comsumer start");
    }
}

1.2 hadoop-source

public class HadoopFileTest {
    public static void main(String[] args) throws Exception{
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(2);

        DataStreamSource<String> streamFromDFS = env.readTextFile("hdfs://fk-1:9000/in/adclick.txt");

        streamFromDFS.print();

        env.execute("read content from hadoop file");
    }
}

1.3 udf-kafka-source

public class UdfKafkaSourceTest {
    public static void main(String[] args) throws Exception{

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        //checkpoint trigger interval in milliseconds
        env.enableCheckpointing(5000);
        //the default CheckpointingMode is EXACTLY_ONCE; AT_LEAST_ONCE can also be set
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);

        Properties prop = new Properties();
        prop.setProperty("bootstrap.servers","fk-1:9092");
        prop.setProperty("group.id","flink_consumer");
        prop.setProperty("auto.offset.reset","latest");

        //topics to consume
        List<String> topics = new ArrayList<>();
        topics.add("test");

        DataStreamSource<ConsumerRecord<String, String>> udfStream = env.addSource(new FlinkKafkaConsumer<>(topics, new MessageDeSerializationSchema(), prop));
        udfStream.map(new MapFunction<ConsumerRecord<String, String>, Tuple2<String, String>>() {
            @Override
            public Tuple2<String, String> map(ConsumerRecord<String, String> record) throws Exception {
                return new Tuple2<>(record.key(), record.value());
            }
        }).print();

        env.execute("udf-consumer");
    }
}

//udf MessageDeSerializationSchema

public class MessageDeSerializationSchema implements KafkaDeserializationSchema<ConsumerRecord<String, String>> {

    @Override
    public boolean isEndOfStream(ConsumerRecord<String, String> stringStringConsumerRecord) {
        return false;
    }

    @Override
    public ConsumerRecord<String, String> deserialize(ConsumerRecord<byte[], byte[]> record) throws Exception {
        //rebuild the ConsumerRecord with key and value decoded as strings
        String key = null;
        String value = null;
        if (record.key() != null) {
            key = new String(record.key());
        }

        if (record.value() != null) {
            value = new String(record.value());
        }
        return new ConsumerRecord<>(
                record.topic(),
                record.partition(),
                record.offset(),
                key,
                value);
    }

    @Override
    public TypeInformation<ConsumerRecord<String, String>> getProducedType() {
        return TypeInformation.of(new TypeHint<ConsumerRecord<String, String>>() {
        });
    }

}

2 data-sink

Note that the write*() methods on DataStream are mainly intended for debugging purposes. They do not participate in Flink’s checkpointing, which means these functions usually have at-least-once semantics. Flushing data to the target system depends on the implementation of the OutputFormat, so not all elements sent to the OutputFormat show up in the target system immediately, and in failure cases those records might be lost.

For reliable, exactly-once delivery of a stream into a file system, use the StreamingFileSink. Custom implementations added through the .addSink(...) method can also participate in Flink’s checkpointing for exactly-once semantics.
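As a small illustration of this distinction (a sketch; `stream` stands for any DataStream<String> and the output path is a placeholder):

//debugging only: writeAsText does not participate in checkpointing (at-least-once at best)
stream.writeAsText("file:///tmp/flink-debug-output");

//for exactly-once delivery to a file system, build a StreamingFileSink and attach it with addSink(...),
//as shown in sections 2.1 and 2.2 below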

2.1 kafka-source-hdfs-sink

public class KafkaSourceHdfsSinkTest {
    public static void main(String[] args) throws Exception{

        //flink env
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(50000);
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);

        //kafka prop
        Properties prop = new Properties();
        prop.setProperty("bootstrap.servers", "fk-1:9092");
        prop.setProperty("group.id", "flink_consumer");
        prop.setProperty("auto.offset.reset", "latest");
        prop.setProperty("enable.auto.commit", "true");

        //kafka topics
        ArrayList<String> topics = new ArrayList<>();
        topics.add("words");

        //flink link to kafka
        DataStreamSource<String> lines = env.addSource(new FlinkKafkaConsumer<>(topics, new SimpleStringSchema(), prop));

        //datastream operation: wordcount
        SingleOutputStreamOperator<Tuple2<String, Integer>> rs = lines.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
                String[] words = s.split(" ");
                Arrays.stream(words).forEach(word -> {
                    collector.collect(new Tuple2<>(word, 1));
                });
            }
        }).keyBy(t -> t.f0).sum(1);
        // show in console
        rs.print();

        //save to hdfs
        String hdfsPath="hdfs://fk-1:9000/yyds";
        StreamingFileSink<Tuple2<String, Integer>> hdfsSink = StreamingFileSink.forRowFormat(new Path(hdfsPath), new SimpleStringEncoder<Tuple2<String, Integer>>("UTF-8"))
                .withRollingPolicy(DefaultRollingPolicy.builder()
                        .withRolloverInterval(TimeUnit.MINUTES.toMillis(15))
                        .withInactivityInterval(TimeUnit.MINUTES.toMillis(5))
                        .withMaxPartSize(1024 * 1024 * 1024)
                        .build()).withBucketAssigner(new DateTimeBucketAssigner<>("yyyy-MM-dd-HH-mm")).build();

        rs.addSink(hdfsSink).setParallelism(1);
        env.execute();
    }
}

2.2 hdfs-sink

public class HdfsSinkTest {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(500);
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.AT_LEAST_ONCE);
        
        String localPath = "D:\\work\\projects\\flink-demo\\src\\main\\resources";
        DataStreamSource<String> inputStream = env.readTextFile(localPath);
        SingleOutputStreamOperator<Tuple2<String, Integer>> sum = inputStream.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
                String[] words = s.split(",");
                collector.collect(new Tuple2<>(words[2], 1));
            }
        }).keyBy(t -> t.f0).sum(1);
        /**
         * Save the result to the HDFS file system
         */
        String hdfsPath = "hdfs://fk-1:9000/in";
        StreamingFileSink<Tuple2<String, Integer>> hdfsSink = StreamingFileSink.forRowFormat(new Path(hdfsPath), new SimpleStringEncoder<Tuple2<String, Integer>>("UTF-8"))
                .withRollingPolicy(DefaultRollingPolicy.builder()
                        .withRolloverInterval(TimeUnit.MINUTES.toMillis(15))
                        .withInactivityInterval(TimeUnit.MINUTES.toMillis(5))
                        .withMaxPartSize(1024 * 1024 * 1024)
                        .build()).withBucketAssigner(new DateTimeBucketAssigner<>("yyyy-MM-dd-HH-mm")).build();

        sum.addSink(hdfsSink).setParallelism(1);
        env.execute();

    }
}

2.3 redis-sink

public class RedisSink {
    public static void main(String[] args) throws Exception{
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(500);
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.AT_LEAST_ONCE);

        //kafka prop
        Properties prop = new Properties();
        prop.setProperty("bootstrap.servers", "fk-1:9092");
        prop.setProperty("group.id", "flink_consumer");
        prop.setProperty("auto.offset.reset", "latest");
        prop.setProperty("enable.auto.commit", "true");

        //kafka topics
        ArrayList<String> topics = new ArrayList<>();
        topics.add("words");

        //flink link to kafka
        DataStreamSource<String> lines = env.addSource(new FlinkKafkaConsumer<>(topics, new SimpleStringSchema(), prop));

        //datastream operation: word count, so the stream type matches the RedisMapper below
        SingleOutputStreamOperator<Tuple2<String, Integer>> counts = lines.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
                Arrays.stream(s.split(" ")).forEach(word -> collector.collect(new Tuple2<>(word, 1)));
            }
        }).keyBy(t -> t.f0).sum(1);

        //jedis prop
        FlinkJedisPoolConfig jedisConfig = new FlinkJedisPoolConfig.Builder()
                .setHost("fk-1")
                .setPort(6379)
                .setDatabase(0)
                .build();
        // save data to redis
        counts.addSink(new org.apache.flink.streaming.connectors.redis.RedisSink<>(jedisConfig, new myRedisMapper()));
        env.execute();

    }

    static class myRedisMapper implements RedisMapper<Tuple2<String,Integer>> {
        /**
         * Specify which Redis data structure to use and the name of its key.
         * The data structure type is set via RedisCommand.
         * Returns descriptor which defines data type.
         */
        @Override
        public RedisCommandDescription getCommandDescription() {
            //store the data in Redis as a HASH; the second argument is the outer key of the hash
            return new RedisCommandDescription(RedisCommand.HSET,"flink_test");
        }

        /**
         * Provide the field (inner key) of the hash entry.
         * Extracts key from data.
         */
        @Override
        public String getKeyFromData(Tuple2<String,Integer> t) {
            //the field (inner key) within the hash
            return t.f0;
        }

        /**
         * Provide the value stored under the hash field.
         * Extracts value from data.
         */
        @Override
        public String getValueFromData(Tuple2<String, Integer> t) {
            return t.f1+"";
        }
    }
}

2.4 kafka-sink

public class KafkaSinkTest {
    public static void main(String[] args) throws Exception{
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(50000,CheckpointingMode.EXACTLY_ONCE);
        //env.enableCheckpointing(500);
        //env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.AT_LEAST_ONCE);

        //kafka prop
        Properties prop = new Properties();
        prop.setProperty("bootstrap.servers", "fk-1:9092");
        prop.setProperty("group.id", "flink_consumer");
        prop.setProperty("auto.offset.reset", "latest");
        prop.setProperty("enable.auto.commit", "true");

        ArrayList<String> topics = new ArrayList<>();
        topics.add("words");

        //flink link to kafka (source)
        DataStreamSource<String> lines = env.addSource(new FlinkKafkaConsumer<>(topics, new SimpleStringSchema(), prop));
        lines.print();

        //kafka producer prop
        Properties producerProp = new Properties();
        producerProp.setProperty("bootstrap.servers", "fk-1:9092");

        //sink to the "test" topic (sink)
        lines.addSink(new FlinkKafkaProducer<String>("test", new SimpleStringSchema(), producerProp)).name("flink-kafka").setParallelism(1);

        env.execute();
    }
}

3 operator

  • Map: takes one element and produces one element; the number of elements does not change.
lines.map(new MapFunction<String, Tuple2<String,Integer>>() {
            @Override
            public Tuple2<String, Integer> map(String s) throws Exception {
                return new Tuple2<>(s,1);
            }
        });
  • FlatMap: takes one element and produces zero, one, or more elements (flattening); the number of elements can change.
lines.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
                String[] words = s.split(" ");
                Arrays.stream(words).forEach(word -> {
                //emit multiple tuples
                    collector.collect(new Tuple2<>(word, 1));
                });
            }
        })
  • Filter: evaluates a boolean predicate for each element and keeps the element if the result is true (filtering); the number of elements can change.
lines.filter(new FilterFunction<String>() {
            @Override
            public boolean filter(String s) throws Exception {
                return s.contains("0")?true:false;
            }
        })
  • Union: takes two or more data streams and creates a new stream containing all elements from all of the streams. If you union a data stream with itself, each element appears twice in the resulting stream. The unioned streams must have the same type.
//t1:(string,string),t2:(string,string)
        t1.union(t2).filter(new FilterFunction<Tuple2<String, String>>() {
            @Override
            public boolean filter(Tuple2<String, String> t) throws Exception {
                return t.f1.contains("a")?true:false;
            }
        }).print();
  • Process: splitting a stream (the former SplitStream) is usually achieved with a ProcessFunction combined with side outputs (getSideOutput).
//define output tags
        OutputTag<String> errorTag = new OutputTag<String>("error"){};
        OutputTag<String> correctTag = new OutputTag<String>("correct"){};
        OutputTag<String> all = new OutputTag<String>("all"){};
        SingleOutputStreamOperator<String> splitStream = t1.process(new ProcessFunction<Tuple2<String, String>, String>() {
            @Override
            public void processElement(Tuple2<String, String> t, Context context, Collector<String> collector) throws Exception {
                if (t.f1.contains("1766")) {
                    context.output(errorTag, t.f0);
                }
                if (t.f1.contains("1756")) {
                    context.output(correctTag, t.f0);
                }
                //forward the element downstream
                collector.collect(t.f1);
            }
        });
        splitStream.getSideOutput(errorTag).printToErr("error records");
        splitStream.getSideOutput(correctTag).printToErr("correct records");
        splitStream.print("all records");
  • KeyBy: partitions the stream into disjoint partitions by key, i.e. grouping.
tupeStream.keyBy(x-> x.f0).reduce(new ReduceFunction<Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> reduce(Tuple2<String, Integer> t1, Tuple2<String, Integer> t2) throws Exception {
                return new Tuple2<>(t1.f0,t1.f1+t2.f1);
            }
        }).print();
  • Aggregations: rolling aggregations on a KeyedStream. The difference between max and maxBy: max only updates the aggregated field, while maxBy returns the whole element that contains the maximum value.
keyedStream.sum(0)
keyedStream.sum("key")
keyedStream.min(0)
keyedStream.min("key")
keyedStream.max(0)
keyedStream.max("key")
keyedStream.minBy(0)
keyedStream.minBy("key")
keyedStream.maxBy(0)
keyedStream.maxBy("key")
// example
tupeStream.keyBy(x -> x.f0).max(1).print();
tupeStream.keyBy(x -> x.f0).maxBy(1).print();

4 state-fault-tolerance

Flink has two basic kinds of state: Keyed State and Operator State. Keyed State can only be used in operations on a KeyedStream; every keyed operator is bound to one or more state values. Operator State is also called non-keyed state; every operator instance has its own operator state. Both Keyed State and Operator State can be stored in two forms: managed and raw.

managed state: Flink controls the state's data structures, for example internal hash tables or RocksDB. Because of this, Flink can optimize memory usage and perform failure recovery on top of managed state.

raw state: Flink only sees the state as a sequence of bytes and knows nothing else about it. Users must handle serialization and deserialization of the state themselves, so Flink cannot perform memory optimization or failure recovery based on raw state.
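The rest of this post works with keyed state. For contrast, here is a minimal sketch of managed operator state, following the buffering-sink pattern from Flink's documentation (the class name, buffer, and flush threshold are illustrative, not part of the original post):

public class BufferingSink implements SinkFunction<Tuple2<String, Integer>>, CheckpointedFunction {

    private transient ListState<Tuple2<String, Integer>> checkpointedState;
    private final List<Tuple2<String, Integer>> bufferedElements = new ArrayList<>();

    @Override
    public void invoke(Tuple2<String, Integer> value, Context context) {
        //buffer elements in memory; flush to the external system once a threshold is reached (omitted)
        bufferedElements.add(value);
    }

    @Override
    public void snapshotState(FunctionSnapshotContext context) throws Exception {
        //copy the in-memory buffer into operator (non-keyed) state at checkpoint time
        checkpointedState.clear();
        for (Tuple2<String, Integer> element : bufferedElements) {
            checkpointedState.add(element);
        }
    }

    @Override
    public void initializeState(FunctionInitializationContext context) throws Exception {
        ListStateDescriptor<Tuple2<String, Integer>> descriptor = new ListStateDescriptor<>(
                "buffered-elements",
                TypeInformation.of(new TypeHint<Tuple2<String, Integer>>() {}));
        checkpointedState = context.getOperatorStateStore().getListState(descriptor);
        //on restore, reload the buffer from the previously checkpointed operator state
        if (context.isRestored()) {
            for (Tuple2<String, Integer> element : checkpointedState.get()) {
                bufferedElements.add(element);
            }
        }
    }
}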

4.1 managed-keyed-state

The managed keyed state interfaces provide access to state of different data types, all of which are scoped to the key of the current element. This means managed keyed state can only be used on a KeyedStream. Flink ships with the following built-in managed keyed state types.

Type | Use case | Methods
ValueState<T> | Stores a single value | update(T), T value(), clear()
ListState<T> | Stores a list of values | add(T), get(), update(List<T>), clear()
MapState<UK, UV> | Stores a map of key/value pairs | put(UK, UV), get(UK), ...
ReducingState<T> | Stores a single value; each added element is automatically combined with the existing state by a user-provided ReduceFunction | add(T), T get(), clear()
AggregatingState<IN, OUT> | Stores a single value; each added element is automatically combined with the existing state by a user-provided AggregateFunction; unlike ReducingState, the input and output types may differ | add(IN), OUT get(), clear()
To access state, you must first create a StateDescriptor. The methods for obtaining state are:
ValueState<T> getState(ValueStateDescriptor<T>)
ReducingState<T> getReducingState(ReducingStateDescriptor<T>)
ListState<T> getListState(ListStateDescriptor<T>)
AggregatingState<IN, OUT> getAggregatingState(AggregatingStateDescriptor<IN, ACC, OUT>)
MapState<UK, UV> getMapState(MapStateDescriptor<UK, UV>)
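For illustration, a sketch of ReducingState from the table above (not from the original post; the descriptor name and the lambda ReduceFunction are assumptions): a RichMapFunction that keeps a running per-key sum.

public class ReducingStateSumFunction extends RichMapFunction<Tuple2<String, Integer>, Tuple2<String, Integer>> {

    private transient ReducingState<Integer> sumState;

    @Override
    public void open(Configuration parameters) throws Exception {
        //the ReduceFunction combines each added element with the value already held in the state
        ReducingStateDescriptor<Integer> descriptor = new ReducingStateDescriptor<>(
                "sumState",
                (a, b) -> a + b,
                Types.INT);
        sumState = getRuntimeContext().getReducingState(descriptor);
    }

    @Override
    public Tuple2<String, Integer> map(Tuple2<String, Integer> value) throws Exception {
        //adding to the state performs the accumulation; get() returns the current reduced value
        sumState.add(value.f1);
        return new Tuple2<>(value.f0, sumState.get());
    }
}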

Implementation steps:
1. Write a class that extends RichMapFunction.
2. Override the open() method: obtain the XxxState object via the RuntimeContext's getXxxState(XxxStateDescriptor) method.
3. Implement the map() method: use the XxxState object to implement the business logic as needed.
4. Apply the custom MapFunction on a KeyedStream.

4.2 value-state

public class ValueStateRichFunction extends RichMapFunction<Tuple2<String,Integer>,Tuple2<String,Integer>> {

    ValueState<Tuple2<String, Integer>> state = null;
    @Override
    public void open(Configuration parameters) throws Exception {
        //create the ValueState handle in open()
        RuntimeContext runtimeContext = getRuntimeContext();
        ValueStateDescriptor<Tuple2<String, Integer>> descriptor = new ValueStateDescriptor<>("valueStateDescriptor", TypeInformation.of(new TypeHint<Tuple2<String, Integer>>() {
        }));
        state=runtimeContext.getState(descriptor);
    }

    /**
     * Word count using ValueState.
     * @param value the incoming (word, 1) tuple
     * @return the accumulated (word, count) tuple
     * @throws Exception
     */
    @Override
    public Tuple2<String, Integer> map(Tuple2<String, Integer> value) throws Exception {
        //read the value currently stored in the ValueState
        Tuple2<String, Integer> historyData = state.value();
        //accumulate the incoming value with the stored history and write the result back into the ValueState
        if (historyData == null){
            state.update(value);
        }else {
            state.update(new Tuple2<>(value.f0, value.f1 + historyData.f1));
        }
        return state.value();
    }
}
public class ValueStateWordCountTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(500, CheckpointingMode.EXACTLY_ONCE);
        env.setStateBackend(new FsStateBackend("hdfs://fk-1:9000/backhand"));

        //kafka prop
        Properties prop = new Properties();
        prop.setProperty("bootstrap.servers", "fk-1:9092");
        prop.setProperty("group.id", "flink_consumer");
        prop.setProperty("auto.offset.reset", "latest");
        prop.setProperty("enable.auto.commit", "true");

        ArrayList<String> topics = new ArrayList<>();
        topics.add("words");

        //flink link to kafka
        DataStreamSource<String> lines = env.addSource(new FlinkKafkaConsumer<>(topics, new SimpleStringSchema(), prop));

        lines.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
                String[] split = s.split(" ");
                Arrays.stream(split).forEach(x -> collector.collect(new Tuple2<>(x, 1)));
            }
        }).keyBy(x -> x.f0).map(new ValueStateRichFunction()).print();
        env.execute();
    }
}

4.3 list-state

  • Count the employees in each department (see the ListState sketch below)
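The original post leaves this section as a stub; the following is a minimal sketch of how ListState could back this use case, assuming the keyed input is a (department, employee) tuple (names and input format are assumptions):

public class ListStateRichFunction extends RichMapFunction<Tuple2<String, String>, String> {

    private transient ListState<String> employeeState;

    @Override
    public void open(Configuration parameters) throws Exception {
        //one employee list per key (department)
        ListStateDescriptor<String> descriptor = new ListStateDescriptor<>("employeeState", String.class);
        employeeState = getRuntimeContext().getListState(descriptor);
    }

    @Override
    public String map(Tuple2<String, String> deptAndName) throws Exception {
        //append the employee to this department's list and emit the current roster
        employeeState.add(deptAndName.f1);
        List<String> employees = new ArrayList<>();
        employeeState.get().forEach(employees::add);
        return deptAndName.f0 + " -> " + employees;
    }
}

//usage: deptStream.keyBy(t -> t.f0).map(new ListStateRichFunction()).print();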

8 pom-xml

<properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <java.version>1.8</java.version>
        <flink.version>1.12.2</flink.version>
        <hadoop.version>3.1.4</hadoop.version>
        <scala.binary.version>2.11</scala.binary.version>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-core</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <!--        Parquet file support-->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-parquet_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <!--Read files from Hadoop-->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <!--        Read from / write results to Kafka-->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <!--Connector dependency for Flink to write data to external systems-->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-files</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <!--Write results to Redis-->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-redis_${scala.binary.version}</artifactId>
            <version>1.1.5</version>
        </dependency>

        
        <!--        lombok-->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.12</version>
            <!--            <scope>provided</scope>-->
        </dependency>

    </dependencies>