【Flink】Getting Started with Transformations

0 Project Dependencies

    <properties>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_2.12</artifactId>
            <version>1.12.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>1.12.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.12</artifactId>
            <version>1.12.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-jdbc_2.12</artifactId>
            <version>1.12.0</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.47</version>
        </dependency>

    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.5.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>

1 Basic Operations

  • flatMap
  • filter
  • map
  • keyBy
  • sum
  • reduce
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

public class TransformationExamples {
    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);

        DataStream<String> linesDS = env.fromElements("hadoop,hdfs,flink,spark",
                "hadoop,hdfs,flink",
                "hadoop,hdfs",
                "fu,you",
                "you,fu",
                "hadoop,spark");

        // Process the data
        // flatMap: split each line on commas and emit the individual words
        DataStream<String> wordsDS = linesDS.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public void flatMap(String value, Collector<String> out) throws Exception {
                // value is one line of words
                String[] words = value.split(",");
                for (String word : words) {
                    // collect each word
                    out.collect(word);
                }
            }
        });

        // filter out "fu"
        // filter
        DataStream<String> filtered_wordsDS = wordsDS.filter(new FilterFunction<String>() {
            @Override
            public boolean filter(String s) throws Exception {
                if (s.equals("fu")){
                    System.out.println("Detected \"fu\"!");
                    return false;
                }
                return true;
            }
        });

        // pair each word with a count of 1 and return it as a tuple
        // map
        DataStream<Tuple2<String, Integer>> word_1 = filtered_wordsDS.map(new MapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> map(String value) throws Exception {
                return Tuple2.of(value, 1);
            }
        });

        // group the data by word
        // keyBy
        KeyedStream<Tuple2<String, Integer>, String> word_1_group = word_1.keyBy(tuple2 -> tuple2.f0);

        // aggregate the grouped data by field index
        // sum variant
        //DataStream<Tuple2<String, Integer>> result = word_1_group.sum(1);

        // reduce variant
        DataStream<Tuple2<String, Integer>> result = word_1_group.reduce(new ReduceFunction<Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> reduce(Tuple2<String, Integer> t1, Tuple2<String, Integer> t2) throws Exception {
                return Tuple2.of(t1.f0, t1.f1 + t2.f1);
            }
        });

        result.print();

        env.execute();
    }
}
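
The same pipeline can also be written with lambdas instead of anonymous inner classes. One caveat: Java erases the generic types of lambdas, so the lambda-based flatMap and map calls need an explicit returns(...) type hint. A minimal sketch (the class name TransformationExamplesLambda is made up for illustration):

import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

public class TransformationExamplesLambda {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStream<String> linesDS = env.fromElements("hadoop,hdfs,flink,spark", "hadoop,hdfs");

        DataStream<Tuple2<String, Integer>> result = linesDS
                // split each line on commas and emit the individual words
                .flatMap((String line, Collector<String> out) -> {
                    for (String word : line.split(",")) {
                        out.collect(word);
                    }
                })
                // lambdas lose their generic types to erasure, so declare the output type
                .returns(Types.STRING)
                .filter(word -> !word.equals("fu"))
                .map(word -> Tuple2.of(word, 1))
                .returns(Types.TUPLE(Types.STRING, Types.INT))
                .keyBy(t -> t.f0)
                .sum(1);

        result.print();
        env.execute();
    }
}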

2 Merging

  • union : merges streams of the same element type
  • connect : connects two streams whose element types may differ
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.ConnectedStreams;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoMapFunction;
import org.apache.flink.util.Collector;

public class TransformationExamples2 {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);

        DataStream<String> d1 = env.fromElements("hadoop,spark,hdfs","hdfs,spark","spark");
        DataStream<String> d2 = env.fromElements("java,python,c++","java,python","java");
        DataStream<Long> d3 = env.fromElements(100L,110L,120L);

        // split on commas
        FlatMapFunction<String, String> flat = new FlatMapFunction<String, String>() {
            @Override
            public void flatMap(String s, Collector<String> collector) throws Exception {
                String[] words = s.split(",");
                for (String word : words) {
                    collector.collect(word);
                }
            }
        };

        d1 = d1.flatMap(flat);
        d2 = d2.flatMap(flat);

        // union: the element types must match
        DataStream<String> d1_d2 = d1.union(d2);
        d1_d2 = d1_d2.map(new MapFunction<String, String>() {
            @Override
            public String map(String s) throws Exception {
                return "d1_d2: " + s;
            }
        });

        // connect can join streams of different types
        ConnectedStreams<String, Long> connect = d1.connect(d3);
        // after connecting, a co-operation (here a CoMapFunction) must unify the two types
        DataStream<String> d1_d3 = connect.map(new CoMapFunction<String, Long, String>() {
            @Override
            public String map1(String s) throws Exception {
                return "d1_d3: String: " + s;
            }

            @Override
            public String map2(Long aLong) throws Exception {
                return "d1_d3: Long: " + aLong;
            }
        });

        d1_d2.print();
        d1_d3.print();

        env.execute();

    }


}

Output: (screenshot omitted)
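
A side note on the two operators: union is variadic, so any number of streams with the same element type can be merged in one call, while connect always joins exactly two streams (of possibly different types). A minimal sketch, reusing env, d1 and d2 from the example above (d4 is a hypothetical extra stream):

        // union is variadic: merge several same-typed streams in a single call
        DataStream<String> d4 = env.fromElements("scala,kotlin");
        DataStream<String> all = d1.union(d2, d4);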

3 Splitting

  • split : splits a stream into several (deprecated for years and removed in Flink 1.12)
  • select : retrieves the streams produced by split (removed together with it)
  • side outputs : the current mechanism; process the stream with a process function and tag the routed elements with an OutputTag, as in the example below
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;

public class TransformationExamples3 {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);

        DataStream<Integer> ds = env.fromElements(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);

        // define an output tag for odd numbers
        OutputTag<Integer> odd = new OutputTag<>("odd", TypeInformation.of(Integer.class));

        // route the odd numbers to the side output
        SingleOutputStreamOperator<Integer> process = ds.process(new ProcessFunction<Integer, Integer>() {
            @Override
            public void processElement(Integer integer, Context context, Collector<Integer> collector) throws Exception {
                if (integer % 2 == 1) {
                    context.output(odd, integer);
                }
            }
        });

        // retrieve the side output by its tag
        DataStream<Integer> oddNums = process.getSideOutput(odd);

        oddNums.print();

        env.execute();
    }
}
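
Note that the example above only emits the odd numbers and silently drops the evens. A ProcessFunction can route every element: the main output goes through the Collector and the side output through the Context. A minimal sketch of that variant, reusing ds and the odd tag from above:

        SingleOutputStreamOperator<Integer> evens = ds.process(new ProcessFunction<Integer, Integer>() {
            @Override
            public void processElement(Integer value, Context ctx, Collector<Integer> out) throws Exception {
                if (value % 2 == 1) {
                    ctx.output(odd, value);   // odd numbers -> side output
                } else {
                    out.collect(value);       // even numbers -> main output
                }
            }
        });

        evens.print("even");
        evens.getSideOutput(odd).print("odd");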

4 Partitioning

4.1 rebalance: Rebalanced Partitioning

Rebalancing prevents a single partition from handling a disproportionate share of the data, which would otherwise extend the total running time.

import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class TransformationExamples4 {
    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);

        DataStream<Long> longDS = env.fromSequence(0, 100);

        // filter the data; this step can leave the partitions unevenly loaded (data skew)
        DataStream<Long> filterDS = longDS.filter(new FilterFunction<Long>() {
            @Override
            public boolean filter(Long num) throws Exception {
                return num > 10;
            }
        });

        // define the mapFunction and keySelector; the KeySelector could also be a lambda,
        // but the RichMapFunction cannot (it needs getRuntimeContext())
        
        RichMapFunction<Long,Tuple2<Integer, Integer>> mapFunction = new RichMapFunction<Long, Tuple2<Integer, Integer>>() {
            @Override
            public Tuple2<Integer, Integer> map(Long aLong) throws Exception {
                int id = getRuntimeContext().getIndexOfThisSubtask();
                return Tuple2.of(id, 1);
            }
        };

        KeySelector<Tuple2<Integer, Integer>, Integer> keySelector = new KeySelector<Tuple2<Integer, Integer>, Integer>() {
            @Override
            public Integer getKey(Tuple2<Integer, Integer> integerIntegerTuple2) throws Exception {
                return integerIntegerTuple2.f0;
            }
        };

        SingleOutputStreamOperator<Tuple2<Integer, Integer>> result1 =
                filterDS.map(mapFunction)
                        .keyBy(keySelector)
                        .sum(1);
        
        SingleOutputStreamOperator<Tuple2<Integer, Integer>> result2 =
                filterDS.rebalance()
                        .map(mapFunction)
                        .keyBy(keySelector)
                        .sum(1);

        
        result1.print("result1"); // may exhibit data skew
        result2.print("result2"); // skew fixed by rebalance()

        
        env.execute();
    }
}

The output (screenshot omitted) has the format (subtask index, number of records processed); result1 is not rebalanced, result2 is:

  • In result1, which was not rebalanced, subtask 2 processed noticeably fewer records: data skew occurred.
  • In result2, which was rebalanced, every subtask processed roughly the same number of records, with no large variance.

4.2 Other Partitioning Strategies

Partitioning call               Description
dataStream.global()             sends all records to the first task
dataStream.broadcast()          broadcasts every record to all tasks
dataStream.forward()            one-to-one forwarding when upstream and downstream parallelism match
dataStream.shuffle()            random, uniform distribution
dataStream.rebalance()          round-robin (rebalance)
dataStream.partitionCustom()    custom partitioning
package Transformation;

import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.Partitioner;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class TransformationExamples5 {
    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC).setParallelism(3);

        DataStream<Long> longDS = env.fromSequence(0, 100);

        // filter the data; this step can leave the partitions unevenly loaded (data skew)
        DataStream<Long> filterDS = longDS.filter(new FilterFunction<Long>() {
            @Override
            public boolean filter(Long num) throws Exception {
                return num > 10;
            }
        });

        // define the mapFunction and keySelector; the KeySelector could also be a lambda,
        // but the RichMapFunction cannot (it needs getRuntimeContext())

        RichMapFunction<Long,Tuple2<Integer, Integer>> mapFunction = new RichMapFunction<Long, Tuple2<Integer, Integer>>() {
            @Override
            public Tuple2<Integer, Integer> map(Long aLong) throws Exception {
                int id = getRuntimeContext().getIndexOfThisSubtask();
                return Tuple2.of(id, 1);
            }
        };

        KeySelector<Tuple2<Integer, Integer>, Integer> keySelector = new KeySelector<Tuple2<Integer, Integer>, Integer>() {
            @Override
            public Integer getKey(Tuple2<Integer, Integer> integerIntegerTuple2) throws Exception {
                return integerIntegerTuple2.f0;
            }
        };

        DataStream<Tuple2<Integer, Integer>> result1 =
                filterDS.map(mapFunction)
                        .keyBy(keySelector)
                        .sum(1);

        DataStream<Tuple2<Integer, Integer>> result2 =
                filterDS.global()
                        .map(mapFunction)
                        .keyBy(keySelector)
                        .sum(1);

        DataStream<Tuple2<Integer, Integer>> result3 =
                filterDS.broadcast()
                        .map(mapFunction)
                        .keyBy(keySelector)
                        .sum(1);

        DataStream<Tuple2<Integer, Integer>> result4 =
                filterDS.forward()
                        .map(mapFunction)
                        .keyBy(keySelector)
                        .sum(1);

        DataStream<Tuple2<Integer, Integer>> result5 =
                filterDS.shuffle()
                        .map(mapFunction)
                        .keyBy(keySelector)
                        .sum(1);

        DataStream<Tuple2<Integer, Integer>> result6 =
                filterDS.rebalance()
                        .map(mapFunction)
                        .keyBy(keySelector)
                        .sum(1);

        DataStream<Tuple2<Integer, Integer>> result7 =
                filterDS.partitionCustom(new Partitioner<Long>() {
                    @Override
                    public int partition(Long aLong, int i) {
                        return aLong > 50 ? 0 : 1;
                    }
                }, new KeySelector<Long, Long>() {
                    @Override
                    public Long getKey(Long aLong) throws Exception {
                        return aLong;
                    }
                })
                        .map(mapFunction)
                        .keyBy(keySelector)
                        .sum(1);


        result1.print("result1: default");
        result2.print("result2: global - everything to the first task");
        result3.print("result3: broadcast");
        result4.print("result4: forward - one-to-one when parallelism matches");
        result5.print("result5: shuffle - random uniform distribution");
        result6.print("result6: rebalance - Round-Robin");
        result7.print("result7: custom partitioning");


        env.execute();
    }
}

Output: (screenshot omitted)
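
One caveat about the custom partitioner in result7: it only ever returns partition 0 or 1, so with parallelism 3 the third subtask receives no data at all. A custom partitioner will more typically spread keys over the actual downstream channel count; a minimal sketch of such a replacement (spread is a hypothetical name):

        DataStream<Long> spread = filterDS.partitionCustom(
                new Partitioner<Long>() {
                    @Override
                    public int partition(Long key, int numPartitions) {
                        // modulo the real channel count so every subtask can receive data
                        return (int) (key % numPartitions);
                    }
                },
                new KeySelector<Long, Long>() {
                    @Override
                    public Long getKey(Long value) {
                        return value;
                    }
                });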
