Flink DataStream API 介绍

有人看我吗

已于 2024-05-31 09:50:23 修改

阅读量586

点赞数 4

分类专栏： # flink 文章标签： flink 大数据

于 2024-05-20 14:30:11 首次发布

本文链接：https://blog.csdn.net/progammer10086/article/details/139003762

版权

flink 专栏收录该内容

4 篇文章 0 订阅

订阅专栏

一、介绍

官网

DataStream API 得名于特殊的 DataStream 类，该类用于表示 Flink 程序中的数据集合。你可以认为它们是可以包含重复项的不可变数据集合。这些数据可以是有界（有限）的，也可以是无界（无限）的，但用于处理它们的API是相同的。

下面列举的只有部分算子

二、基础算子

2.1、转换算子

2.1.1、Map

Map算子：输入一个元素同时输出一个元素,这里的写法和Java类似,可以使用糖化语法或者实现Function接口

package com.xx.common.study.api.base;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * Demonstrates the {@code map} operator: each input element yields exactly one output element.
 * The transformation is written twice, once as a lambda expression and once as an explicit
 * {@link MapFunction} implementation.
 *
 * @author xiaxing
 * @since 2024/5/17 14:27
 */
public class DataStreamMapApiDemo {

    /**
     * Builds a three-element stream, applies both map variants and prints the results.
     *
     * @param args unused
     * @throws Exception if the job fails to execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<Integer> numbers = env.fromElements(1, 2, 3);

        // Variant 1: lambda expression, multiplies every element by two
        SingleOutputStreamOperator<Integer> doubled = numbers.map(n -> n * 2);
        doubled.print("数据乘2");

        // Variant 2: explicit MapFunction implementation, adds two to every element
        MapFunction<Integer, Integer> plusTwo = new MapFunction<Integer, Integer>() {
            @Override
            public Integer map(Integer value) throws Exception {
                return value + 2;
            }
        };
        SingleOutputStreamOperator<Integer> shifted = numbers.map(plusTwo);
        shifted.print("数据加2");

        env.execute();
    }
}

2.1.2、FlatMap

FlatMap算子：输入一个元素同时产生零个、一个或多个元素

package com.xx.common.study.api.base;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

import java.util.Arrays;
import java.util.List;

/**
 * @author xiaxing
 * @describe FlatMap算子,输入一个元素同时产生零个、一个或多个元素
 * @since 2024/5/17 14:27
 */
public class DataStreamFlatMapApiDemo {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> sourceStream = env.fromElements("1,2,3");

        // 对source进行加工处理
        sourceStream.flatMap((FlatMapFunction<String, List<String>>) (value, out) -> {
            String[] split = value.split(",");
            out.collect(Arrays.asList(split));
        }).print();

        // 错误写法,和Java写法不用,无法使用这种糖化语法
//        sourceStream.flatMap((k, v) -> {
//            String[] split = k.split(",");
//            v.collect(split);
//        }).print();

        env.execute();
    }
}

2.1.3、Filter

Filter算子：为每个元素执行一个布尔 function，并保留那些 function 输出值为 true 的元素

package com.xx.common.study.api.base;

import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * Demonstrates the {@code filter} operator: a boolean function is evaluated for every element and
 * only elements for which it returns {@code true} are kept. Both variants below keep the even
 * numbers of the input.
 *
 * @author xiaxing
 * @since 2024/5/17 14:27
 */
public class DataStreamFilterApiDemo {

    /**
     * Filters the stream {@code 1, 2, 3} down to its even numbers, once with a lambda and once
     * with an explicit {@link FilterFunction}, and prints both results.
     *
     * @param args unused
     * @throws Exception if the job fails to execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<Integer> sourceStream = env.fromElements(1, 2, 3);

        // Lambda variant: keep even numbers (the label previously said "integers" by mistake)
        sourceStream.filter(e -> (e % 2) == 0).print("糖化语法保留偶数");

        // Explicit FilterFunction variant with the same predicate
        sourceStream.filter(new FilterFunction<Integer>() {
            @Override
            public boolean filter(Integer value) throws Exception {
                return value % 2 == 0;
            }
        }).print("实现Function保留偶数");

        env.execute();
    }
}

2.2、聚合算子

2.2.1、KeyBy

KeyBy算子：在逻辑上将流划分为不相交的分区。具有相同 key 的记录都分配到同一个分区。

对于Flink而言，DataStream是没有直接进行聚合的API的。因为我们对海量数据做聚合肯定要进行分区并行处理，这样才能提高效率

在内部， keyBy() 是通过哈希分区实现的

package com.xx.common.study.api.base;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * Demonstrates the {@code keyBy} operator, which logically splits a stream into disjoint
 * partitions so that records sharing a key end up in the same partition.
 *
 * @author xiaxing
 * @since 2024/5/17 14:27
 */
public class DataStreamKeyByApiDemo {

    /** Simple record with an {@code id} used as the key and a {@code count} used for aggregation. */
    @Data
    @AllArgsConstructor
    @NoArgsConstructor
    public static class keyByDemo {
        private Integer id;
        private Integer count;
    }

    /**
     * Keys four sample records by id, prints the keyed stream and then prints a rolling sum of
     * the {@code count} field.
     *
     * @param args unused
     * @throws Exception if the job fails to execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStreamSource<keyByDemo> records = env.fromElements(
                new keyByDemo(1, 1),
                new keyByDemo(2, 2),
                new keyByDemo(3, 3),
                new keyByDemo(1, 4)
        );

        // Group the records by their id
        KeyedStream<keyByDemo, Integer> byId = records.keyBy(record -> record.getId());
        byId.print("按照key分组");

        // Once keyed, the common aggregation operators become available. sum accepts either a
        // positional index (for Tuple types) or, as here, a field name.
        byId.sum("count").print();

        env.execute();
    }
}

2.2.2、sum/min/max/minBy/maxBy

按键分区的数据流KeyedStream，Flink提供了一些基础的聚合操作

package com.xx.common.study.api.base;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * Demonstrates the simple aggregations available on a keyed stream:
 * {@code sum}, {@code min}, {@code max}, {@code minBy} and {@code maxBy}.
 *
 * @author xiaxing
 * @since 2024/5/17 14:27
 */
public class DataStreamKeyByAggregationApiDemo {

    /** Simple record with an {@code id} used as the key and a {@code count} that is aggregated. */
    @Data
    @AllArgsConstructor
    @NoArgsConstructor
    public static class keyByAggregationDemo {
        private Integer id;
        private Integer count;
    }

    /**
     * Keys four sample records by id and prints the result of every simple aggregation on the
     * {@code count} field.
     *
     * @param args unused
     * @throws Exception if the job fails to execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStreamSource<keyByAggregationDemo> records = env.fromElements(
                new keyByAggregationDemo(1, 1),
                new keyByAggregationDemo(2, 2),
                new keyByAggregationDemo(3, 3),
                new keyByAggregationDemo(1, 4)
        );
        KeyedStream<keyByAggregationDemo, Integer> byId = records.keyBy(record -> record.getId());

        // Aggregations that are available after keying the stream
        final String field = "count";
        // Rolling sum
        byId.sum(field).print("sum");
        // Rolling minimum
        byId.min(field).print("min");
        // Rolling maximum
        byId.max(field).print("max");
        // Rolling minimum; like min, but keeps the other attributes of the matching record
        byId.minBy(field).print("minBy");
        // Rolling maximum; like max, but keeps the other attributes of the matching record
        byId.maxBy(field).print("maxBy");

        env.execute();
    }
}

2.2.3、Reduce

Reduce算子：在相同 key 的数据流上“滚动”执行 reduce。将当前元素与最后一次 reduce 得到的值组合然后输出新值

package com.xx.common.study.api.base;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * Demonstrates the {@code reduce} operator: a rolling reduce over records with the same key,
 * combining the current element with the previously reduced value and emitting the new value.
 *
 * @author xiaxing
 * @since 2024/5/17 14:27
 */
public class DataStreamReduceApiDemo {

    /** Simple record with a string {@code id} used as the key and a {@code count} that is summed. */
    @Data
    @AllArgsConstructor
    @NoArgsConstructor
    public static class reduceByDemo {
        private String id;
        private Integer count;
    }

    /**
     * Keys four sample records by id and prints the running sum of {@code count} per key.
     *
     * @param args unused
     * @throws Exception if the job fails to execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<reduceByDemo> records = env.fromElements(
                new reduceByDemo("1", 1),
                new reduceByDemo("2", 2),
                new reduceByDemo("3", 3),
                new reduceByDemo("1", 4)
        );

        // Adds the count of the incoming record onto the previously reduced record and returns
        // that same (updated) record.
        ReduceFunction<reduceByDemo> sumCounts = new ReduceFunction<reduceByDemo>() {
            @Override
            public reduceByDemo reduce(reduceByDemo value1, reduceByDemo value2) throws Exception {
                int total = value1.getCount() + value2.getCount();
                value1.setCount(total);
                return value1;
            }
        };

        records.keyBy(record -> record.getId())
                .reduce(sumCounts)
                .print();

        env.execute();
    }
}

三、窗口

官网地址

3.1、概念

窗口（Window）是处理无界流的关键所在。窗口可以将数据流装入大小有限的“桶”中，再对每个“桶”加以处理。

Flink 中的时间有三种类型：

Event Time：是事件创建的时间。它通常由事件中的时间戳描述，例如采集的日志数据中，每一条日志都会记录自己的生成时间，Flink 通过时间戳分配器访问事件时间戳。
Ingestion Time：是数据进入 Flink 的时间。
Processing Time：是每一个执行基于时间操作的算子的本地系统时间，与机器相关，默认的时间属性就是 Processing Time。

3.2、语法

Keyed Windows

stream
       .keyBy(...)               <-  仅 keyed 窗口需要
       .window(...)              <-  必填项："assigner"
      [.trigger(...)]            <-  可选项："trigger" (省略则使用默认 trigger)
      [.evictor(...)]            <-  可选项："evictor" (省略则不使用 evictor)
      [.allowedLateness(...)]    <-  可选项："lateness" (省略则为 0)
      [.sideOutputLateData(...)] <-  可选项："output tag" (省略则不对迟到数据使用 side output)
       .reduce/aggregate/apply()      <-  必填项："function"
      [.getSideOutput(...)]      <-  可选项："output tag"

Non-Keyed Windows

stream
       .windowAll(...)           <-  必填项："assigner"
      [.trigger(...)]            <-  可选项："trigger" (省略则使用默认 trigger)
      [.evictor(...)]            <-  可选项："evictor" (省略则不使用 evictor)
      [.allowedLateness(...)]    <-  可选项："lateness" (省略则为 0)
      [.sideOutputLateData(...)] <-  可选项："output tag" (省略则不对迟到数据使用 side output)
       .reduce/aggregate/apply()      <-  必填项："function"
      [.getSideOutput(...)]      <-  可选项："output tag"

3.3、Window Assigners

WindowAssigner 是一个抽象类，Flink 已经内置实现了 4 种常用窗口：滚动窗口、滑动窗口、会话窗口和全局窗口

3.3.1、滚动窗口（Tumbling Windows）

滚动窗口的 assigner 分发元素到指定大小的窗口。滚动窗口的大小是固定的，且各自范围之间不重叠。比如说，如果你指定了滚动窗口的大小为 5 分钟，那么每 5 分钟就会有一个窗口被计算，且一个新的窗口被创建（如下图所示）。

在这里插入图片描述

DataStream<T> input = ...;

// 滚动 event-time 窗口
input
    .keyBy(<key selector>)
    .window(TumblingEventTimeWindows.of(Time.seconds(5)))
    .<windowed transformation>(<window function>);

// 滚动 processing-time 窗口
input
    .keyBy(<key selector>)
    .window(TumblingProcessingTimeWindows.of(Time.seconds(5)))
    .<windowed transformation>(<window function>);

// 长度为一天的滚动 event-time 窗口， 偏移量为 -8 小时。
input
    .keyBy(<key selector>)
    .window(TumblingEventTimeWindows.of(Time.days(1), Time.hours(-8)))
    .<windowed transformation>(<window function>);

3.3.2、滑动窗口（Sliding Windows）

与滚动窗口类似，滑动窗口的 assigner 分发元素到指定大小的窗口，窗口大小通过 window size 参数设置。滑动窗口需要一个额外的滑动距离（window slide）参数来控制生成新窗口的频率。因此，如果 slide 小于窗口大小，滑动窗口可以允许窗口重叠。这种情况下，一个元素可能会被分发到多个窗口。

比如说，你设置了大小为 10 分钟，滑动距离 5 分钟的窗口，你会在每 5 分钟得到一个新的窗口，里面包含之前 10 分钟到达的数据（如下图所示）。

在这里插入图片描述

DataStream<T> input = ...;

// 滑动 event-time 窗口
input
    .keyBy(<key selector>)
    .window(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5)))
    .<windowed transformation>(<window function>);

// 滑动 processing-time 窗口
input
    .keyBy(<key selector>)
    .window(SlidingProcessingTimeWindows.of(Time.seconds(10), Time.seconds(5)))
    .<windowed transformation>(<window function>);

// 滑动 processing-time 窗口，偏移量为 -8 小时
input
    .keyBy(<key selector>)
    .window(SlidingProcessingTimeWindows.of(Time.hours(12), Time.hours(1), Time.hours(-8)))
    .<windowed transformation>(<window function>);

3.3.3、会话窗口（Session Windows）

会话窗口的 assigner 会把数据按活跃的会话分组。与滚动窗口和滑动窗口不同，会话窗口不会相互重叠，且没有固定的开始或结束时间。会话窗口在一段时间没有收到数据之后会关闭，即在一段不活跃的间隔之后。会话窗口的 assigner 可以设置固定的会话间隔（session gap）或用 session gap extractor 函数来动态地定义多长时间算作不活跃。当超出了不活跃的时间段，当前的会话就会关闭，并且将接下来的数据分发到新的会话窗口。

在这里插入图片描述

DataStream<T> input = ...;

// 设置了固定间隔的 event-time 会话窗口
input
    .keyBy(<key selector>)
    .window(EventTimeSessionWindows.withGap(Time.minutes(10)))
    .<windowed transformation>(<window function>);
    
// 设置了动态间隔的 event-time 会话窗口
input
    .keyBy(<key selector>)
    .window(EventTimeSessionWindows.withDynamicGap((element) -> {
        // 决定并返回会话间隔
    }))
    .<windowed transformation>(<window function>);

// 设置了固定间隔的 processing-time session 窗口
input
    .keyBy(<key selector>)
    .window(ProcessingTimeSessionWindows.withGap(Time.minutes(10)))
    .<windowed transformation>(<window function>);
    
// 设置了动态间隔的 processing-time 会话窗口
input
    .keyBy(<key selector>)
    .window(ProcessingTimeSessionWindows.withDynamicGap((element) -> {
        // 决定并返回会话间隔
    }))
    .<windowed transformation>(<window function>);

3.3.4、全局窗口（Global Windows）

全局窗口的 assigner 将拥有相同 key 的所有数据分发到一个全局窗口。这样的窗口模式仅在你指定了自定义的 trigger 时有用。否则，计算不会发生，因为全局窗口没有天然的终点去触发其中积累的数据。

在这里插入图片描述

DataStream<T> input = ...;

input
    .keyBy(<key selector>)
    .window(GlobalWindows.create())
    .<windowed transformation>(<window function>);

3.4、窗口函数

定义了 window assigner 之后，我们需要指定当窗口触发之后，我们如何计算每个窗口中的数据

窗口函数有三种：ReduceFunction、AggregateFunction 或 ProcessWindowFunction。
前两者执行起来更高效，因为 Flink 可以在每条数据到达窗口后进行增量聚合（incrementally aggregate）。而 ProcessWindowFunction 会得到能够遍历当前窗口内所有数据的 Iterable，以及关于这个窗口的 meta-information

3.4.1、ReduceFunction

ReduceFunction 指定两条输入数据如何合并起来产生一条输出数据，输入和输出数据的类型必须相同

package com.xx.common.study.api.windows;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;

/**
 * Window function demo: {@link ReduceFunction} on a tumbling processing-time window, written with
 * explicit (anonymous class) function implementations.
 *
 * @author xiaxing
 * @since 2024/5/17 14:27
 */
public class DataStreamWindowsReduceApiDemo {

    /** Record parsed from a socket line of the form {@code id,count}. */
    @Data
    @NoArgsConstructor
    @AllArgsConstructor
    public static class TumblingWindows {
        private Integer id;
        private Integer count;
    }

    /**
     * Reads {@code id,count} lines from {@code localhost:7777}, keys them by id and prints the
     * per-key sum of counts for every 5-second tumbling window.
     *
     * @param args unused
     * @throws Exception if the job fails to execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> lines = env.socketTextStream("localhost", 7777);

        // Parses "id,count" into a record
        MapFunction<String, TumblingWindows> parser = new MapFunction<String, TumblingWindows>() {
            @Override
            public TumblingWindows map(String value) throws Exception {
                String[] fields = value.split(",");
                Integer id = Integer.valueOf(fields[0]);
                Integer count = Integer.valueOf(fields[1]);
                return new TumblingWindows(id, count);
            }
        };

        // Uses the record id as the key
        KeySelector<TumblingWindows, Integer> idSelector = new KeySelector<TumblingWindows, Integer>() {
            @Override
            public Integer getKey(TumblingWindows value) throws Exception {
                return value.getId();
            }
        };

        // Combines two records of the same key by adding their counts
        ReduceFunction<TumblingWindows> countSummer = new ReduceFunction<TumblingWindows>() {
            @Override
            public TumblingWindows reduce(TumblingWindows value1, TumblingWindows value2) throws Exception {
                int total = value1.getCount() + value2.getCount();
                return new TumblingWindows(value1.getId(), total);
            }
        };

        lines.map(parser)
                .keyBy(idSelector)
                .window(TumblingProcessingTimeWindows.of(Time.seconds(5L)))
                .reduce(countSummer)
                .print();

        env.execute();
    }
}

3.4.2、ReduceFunction糖化语法

使用Lambda糖化语法对代码进行了简化

package com.xx.common.study.api.windows;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;

/**
 * Window function demo: the same tumbling-window reduce as the explicit variant, shortened with
 * lambda expressions.
 *
 * @author xiaxing
 * @since 2024/5/17 14:27
 */
public class DataStreamWindowsReduceLambdaApiDemo {

    /** Record parsed from a socket line of the form {@code id,count}. */
    @Data
    @NoArgsConstructor
    @AllArgsConstructor
    public static class TumblingWindows {
        private Integer id;
        private Integer count;
    }

    /**
     * Reads {@code id,count} lines from {@code localhost:7777}, keys them by id and prints the
     * per-key sum of counts for every 5-second tumbling window.
     *
     * @param args unused
     * @throws Exception if the job fails to execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> lines = env.socketTextStream("localhost", 7777);

        lines
                .map(line -> parse(line))
                .keyBy(record -> record.getId())
                .window(TumblingProcessingTimeWindows.of(Time.seconds(5L)))
                .reduce((left, right) -> {
                    int total = left.getCount() + right.getCount();
                    return new TumblingWindows(left.getId(), total);
                })
                .print();

        env.execute();
    }

    /** Parses a line of the form {@code id,count} into a record. */
    private static TumblingWindows parse(String line) {
        String[] fields = line.split(",");
        return new TumblingWindows(Integer.valueOf(fields[0]), Integer.valueOf(fields[1]));
    }
}

3.4.3、AggregateFunction

ReduceFunction 是 AggregateFunction 的特殊情况。 AggregateFunction 接收三个类型：输入数据的类型(IN)、累加器的类型（ACC）和输出数据的类型（OUT）。输入数据的类型是输入流的元素类型，AggregateFunction 接口有如下几个方法：把每一条元素加进累加器、创建初始累加器、合并两个累加器、从累加器中提取输出（OUT 类型）

package com.xx.common.study.api.windows;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;

import java.util.Optional;

/**
 * Window function demo: {@link AggregateFunction} on a tumbling processing-time window.
 * The accumulator, input and output all use the same record type and the aggregation sums the
 * {@code count} field per key.
 *
 * @author xiaxing
 * @since 2024/5/17 14:27
 */
public class DataStreamWindowsAggregateApiDemo {

    /** Record parsed from a socket line of the form {@code id,count}; also used as accumulator. */
    @Data
    @NoArgsConstructor
    @AllArgsConstructor
    public static class TumblingWindows {
        private Integer id;
        private Integer count;
    }

    /**
     * Reads {@code id,count} lines from {@code localhost:7777}, keys them by id and prints the
     * per-key sum of counts for every 5-second tumbling window.
     *
     * @param args unused
     * @throws Exception if the job fails to execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> socketStream = env.socketTextStream("localhost", 7777);

        // Sum of counts
        AggregateFunction<TumblingWindows, TumblingWindows, TumblingWindows> aggregateFunction = new AggregateFunction<TumblingWindows, TumblingWindows, TumblingWindows>() {
            @Override
            public TumblingWindows createAccumulator() {
                // Fresh accumulator; both fields start out as null
                return new TumblingWindows();
            }

            @Override
            public TumblingWindows add(TumblingWindows value, TumblingWindows accumulator) {
                // Fold the incoming element into the accumulator and return the updated one.
                // Either count may be null (a fresh accumulator always is), so treat null as 0.
                return new TumblingWindows(value.getId(), countOrZero(value) + countOrZero(accumulator));
            }

            @Override
            public TumblingWindows getResult(TumblingWindows accumulator) {
                // The accumulator itself is the result
                return accumulator;
            }

            @Override
            public TumblingWindows merge(TumblingWindows a, TumblingWindows b) {
                // Merge two accumulators into a new one. Either side may still be a fresh
                // accumulator with null fields, so fall back to the other id and to 0.
                Integer id = a.getId() != null ? a.getId() : b.getId();
                return new TumblingWindows(id, countOrZero(a) + countOrZero(b));
            }
        };

        socketStream
                .map(value -> {
                    String[] split = value.split(",");
                    return new TumblingWindows(Integer.valueOf(split[0]), Integer.valueOf(split[1]));
                })
                .keyBy(TumblingWindows::getId)
                .window(TumblingProcessingTimeWindows.of(Time.seconds(5L)))
                .aggregate(aggregateFunction)
                .print();

        env.execute();
    }

    /** Returns the record's count, or 0 when the count has not been set. */
    private static int countOrZero(TumblingWindows record) {
        return Optional.ofNullable(record.getCount()).orElse(0);
    }
}

3.4.4、ProcessWindowFunction

ProcessWindowFunction 可以与 ReduceFunction 或 AggregateFunction 搭配使用，使其能够在数据到达窗口的时候进行增量聚合。当窗口关闭时，ProcessWindowFunction 将会得到聚合的结果。这样它就可以增量聚合窗口的元素并且从 ProcessWindowFunction 中获得窗口的元数据。

package com.xx.common.study.api.windows;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

/**
 * Window function demo: a {@link ReduceFunction} combined with a {@link ProcessWindowFunction}.
 * The reduce function sums counts per key, and the process function then only forwards results
 * whose count is greater than 10.
 *
 * @author xiaxing
 * @since 2024/5/17 14:27
 */
public class DataStreamWindowsReduceProcessApiDemo {

    /** Record parsed from a socket line of the form {@code id,count}. */
    @Data
    @NoArgsConstructor
    @AllArgsConstructor
    public static class TumblingWindows {
        private Integer id;
        private Integer count;
    }

    /**
     * Reads {@code id,count} lines from {@code localhost:7777}, keys them by id, and for every
     * 5-second tumbling window prints the summed record if its count exceeds 10.
     *
     * @param args unused
     * @throws Exception if the job fails to execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> lines = env.socketTextStream("localhost", 7777);

        lines
                .map(line -> {
                    String[] fields = line.split(",");
                    Integer id = Integer.valueOf(fields[0]);
                    Integer count = Integer.valueOf(fields[1]);
                    return new TumblingWindows(id, count);
                })
                .keyBy(record -> record.getId())
                .window(TumblingProcessingTimeWindows.of(Time.seconds(5L)))
                .reduce(new MyReduceFunction(), new MyProcessWindowsFunction())
                .print();

        env.execute();
    }

    /** Combines two records of the same key by adding their counts. */
    private static class MyReduceFunction implements ReduceFunction<TumblingWindows> {
        @Override
        public TumblingWindows reduce(TumblingWindows value1, TumblingWindows value2) throws Exception {
            int total = value1.getCount() + value2.getCount();
            return new TumblingWindows(value1.getId(), total);
        }
    }

    /** Forwards only those window elements whose count is greater than 10. */
    private static class MyProcessWindowsFunction extends ProcessWindowFunction<TumblingWindows, TumblingWindows, Integer, TimeWindow> {
        @Override
        public void process(Integer key, Context context, Iterable<TumblingWindows> elements, Collector<TumblingWindows> out) throws Exception {
            for (TumblingWindows candidate : elements) {
                // Emit the element only when its count exceeds 10
                if (candidate.getCount() > 10) {
                    out.collect(candidate);
                }
            }
        }
    }
}

四、Join

合并流

4.1 Union

将两个或多个数据流联合来创建一个包含所有流中数据的新流。注意：如果一个数据流和自身进行联合，这个流中的每个数据将在合并后的流中出现两次

package com.xx.common.study.api.union;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * Demonstrates the {@code union} operator, which combines two or more streams into a new stream
 * containing the elements of all of them. Note that a stream unioned with itself contributes
 * every element twice.
 * <ol>
 *   <li>All streams must have the same element type.</li>
 *   <li>More than two streams can be combined at once.</li>
 * </ol>
 *
 * @author xiaxing
 * @since 2024/5/14 14:47
 */
public class UnionDemo {

    /**
     * Unions two integer streams with a string stream that is first converted to integers, then
     * prints the combined stream.
     *
     * @param args unused
     * @throws Exception if the job fails to execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStreamSource<Integer> units = env.fromElements(1, 2, 3);
        DataStreamSource<Integer> tens = env.fromElements(11, 22, 33);
        DataStreamSource<String> hundredsText = env.fromElements("111", "222", "333");

        // The string stream has to be converted so that all element types match
        DataStream<Integer> hundreds = hundredsText.map(Integer::valueOf);

        // Equivalent single-call form: units.union(tens, hundreds)
        DataStream<Integer> combined = units.union(tens).union(hundreds);

        combined.print();
        env.execute();
    }
}

4.2 Connect

“连接” 两个数据流并保留各自的类型。connect 允许在两个流的处理逻辑之间共享状态。

package com.xx.common.study.api.connect;

import org.apache.flink.streaming.api.datastream.ConnectedStreams;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoMapFunction;

/**
 * Demonstrates the {@code connect} operator, which "connects" two streams while each keeps its
 * own element type, and allows the processing logic of both sides to share state.
 * <ol>
 *   <li>The element types of the two streams may differ.</li>
 *   <li>Exactly two streams are combined per call.</li>
 * </ol>
 *
 * @author xiaxing
 * @since 2024/5/14 14:53
 */
public class ConnectDemo {

    /**
     * Connects an integer stream with a string stream, maps both sides to strings and prints the
     * resulting stream.
     *
     * @param args unused
     * @throws Exception if the job fails to execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        DataStreamSource<Integer> numbers = env.fromElements(1, 2, 3);
        DataStreamSource<String> letters = env.fromElements("a", "b", "c");

        ConnectedStreams<Integer, String> connected = numbers.connect(letters);

        // map1 handles elements of the first stream, map2 those of the second
        CoMapFunction<Integer, String, String> toText = new CoMapFunction<Integer, String, String>() {
            @Override
            public String map1(Integer value) throws Exception {
                return String.valueOf(value);
            }

            @Override
            public String map2(String value) throws Exception {
                return value;
            }
        };

        SingleOutputStreamOperator<String> merged = connected.map(toText);
        merged.print();
        env.execute();
    }
}

4.3 Window Join

根据指定的 key 和窗口 join 两个数据流

package com.xx.common.study.api.connect;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;

/**
 * Demonstrates the window join: two streams are joined on a key within a shared window.
 *
 * @author xiaxing
 * @since 2024/5/17 14:27
 */
public class WindowsJoinDemo {

    /**
     * Record with an {@code id} used as the join key and a {@code count} that is summed.
     * Declared public, like the data classes of the other demos, so that it satisfies Flink's
     * POJO requirements.
     */
    @Data
    @NoArgsConstructor
    @AllArgsConstructor
    public static class WindowsJoin {
        private Integer id;
        private Integer count;
    }

    /**
     * Joins two small in-memory streams on {@code id} inside a 5-second tumbling
     * processing-time window and prints one record per match, carrying the summed count.
     *
     * @param args unused
     * @throws Exception if the job fails to execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStreamSource<WindowsJoin> source1 = env.fromElements(
                new WindowsJoin(1, 1),
                new WindowsJoin(2, 10),
                new WindowsJoin(3, 100)
        );
        DataStreamSource<WindowsJoin> source2 = env.fromElements(
                new WindowsJoin(1, 1),
                new WindowsJoin(10, 10),
                new WindowsJoin(20, 100)
        );

        // NOTE(review): both inputs are bounded while the window is processing-time based; the
        // job may finish before the 5-second window fires - confirm that output is produced.
        DataStream<WindowsJoin> apply = source1.join(source2)
                .where(WindowsJoin::getId)
                .equalTo(WindowsJoin::getId)
                .window(TumblingProcessingTimeWindows.of(Time.seconds(5L)))
                .apply((first, second) -> new WindowsJoin(first.getId(), first.getCount() + second.getCount()));

        apply.print();
        env.execute();
    }
}

五、旁路输出

分流

package com.xx.common.study.api.output;

import com.xx.common.study.domain.WaterSensor;
import com.xx.common.study.function.WaterSensorMapFunction;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.streaming.api.datastream.SideOutputDataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;

import java.util.Objects;

/**
 * Demonstrates side outputs. Typical use cases:
 * <ol>
 *   <li>emitting alerts for abnormal data;</li>
 *   <li>processing different kinds of data separately.</li>
 * </ol>
 *
 * @author xiaxing
 * @since 2024/5/14 14:13
 */
public class SideOutputDemo {

    /**
     * Reads sensor records from {@code 127.0.0.1:7777}, routes records with id {@code s1} and
     * {@code s2} to their own side outputs, keeps everything else on the main stream and prints
     * all three streams.
     *
     * @param args unused
     * @throws Exception if the job fails to execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        SingleOutputStreamOperator<WaterSensor> sensors = env
                .socketTextStream("127.0.0.1", 7777)
                .map(new WaterSensorMapFunction());

        // Side-output tags: the first argument is the tag id, the second the element type
        // carried by that side output.
        TypeInformation<WaterSensor> sensorType = TypeInformation.of(WaterSensor.class);
        OutputTag<WaterSensor> s1Tag = new OutputTag<>("s1", sensorType);
        OutputTag<WaterSensor> s2Tag = new OutputTag<>("s2", sensorType);

        // Separate the records of sensors s1 and s2 from the rest
        ProcessFunction<WaterSensor, WaterSensor> router = new ProcessFunction<WaterSensor, WaterSensor>() {
            @Override
            public void processElement(WaterSensor value, Context ctx, Collector<WaterSensor> out) {
                String sensorId = value.getId();
                if (Objects.equals(sensorId, "s1")) {
                    // ctx.output(tag, record) writes the record to the side output of that tag
                    ctx.output(s1Tag, value);
                    return;
                }
                if (Objects.equals(sensorId, "s2")) {
                    ctx.output(s2Tag, value);
                    return;
                }
                // Everything else stays on the main stream
                out.collect(value);
            }
        };
        SingleOutputStreamOperator<WaterSensor> mainStream = sensors.process(router);

        // Print the main stream
        mainStream.print("主流");

        // Side outputs are obtained from the main stream via their tags
        SideOutputDataStream<WaterSensor> s1Stream = mainStream.getSideOutput(s1Tag);
        SideOutputDataStream<WaterSensor> s2Stream = mainStream.getSideOutput(s2Tag);
        s1Stream.print("s1");
        s2Stream.print("s2");

        env.execute();
    }
}

六、Process Function

官网地址

这里将 process 算子单独拿出来：process 算子非常灵活，既可以手动实现上面的基础算子，也可以处理一些更加复杂的业务场景，并且还具备状态管理等额外功能。

Process Function 可以在 keyBy、window、windowAll、connect 等绝大部分算子之后使用

package com.xx.common.study.api.process;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;

/**
 * Demonstrates {@link ProcessFunction} by re-implementing the behavior of the basic
 * {@code map} and {@code filter} operators with it.
 *
 * @author xiaxing
 * @since 2024/5/30 17:14
 */
public class ProcessFunctionDemo {

    /** Simple record with an {@code id} and a {@code count}. */
    @Data
    @AllArgsConstructor
    @NoArgsConstructor
    public static class ProcessDemo {
        private Integer id;
        private Integer count;
    }

    /**
     * Feeds four sample records through two process functions, one doubling the count
     * (map-like) and one keeping only records with an even count (filter-like), and prints both
     * results.
     *
     * @param args unused
     * @throws Exception if the job fails to execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<ProcessDemo> sourceStream = env.fromElements(
                new ProcessDemo(1, 1),
                new ProcessDemo(2, 2),
                new ProcessDemo(3, 3),
                new ProcessDemo(1, 4)
        );

        // map-like behavior: emit a new record with the doubled count. The input record is left
        // untouched because the same source also feeds the filter below.
        sourceStream.process(new ProcessFunction<ProcessDemo, ProcessDemo>() {
            @Override
            public void processElement(ProcessDemo value, ProcessFunction<ProcessDemo, ProcessDemo>.Context ctx, Collector<ProcessDemo> out) throws Exception {
                out.collect(new ProcessDemo(value.getId(), value.getCount() * 2));
            }
        }).print("map");

        // filter-like behavior: forward only records whose count is even
        sourceStream.process(new ProcessFunction<ProcessDemo, ProcessDemo>() {
            @Override
            public void processElement(ProcessDemo value, ProcessFunction<ProcessDemo, ProcessDemo>.Context ctx, Collector<ProcessDemo> out) throws Exception {
                if (value.getCount() % 2 == 0) {
                    out.collect(value);
                }
            }
        }).print("filter");

        env.execute();
    }
}