《动手学Flink》——Flink Data Transformation (转换算子)

程序与数据流

在这里插入图片描述
所有的Flink 程序都是由三部分组成:Source,Transformation 和 Sink
Source: 负责读取数据源数据
Transformation:利用各种算子进行处理加工
Sink:负责输出。

Map 算子:对DataStream进行操作,返回一个新的DataStream

将DataStream类型 转化为 DataStream类型。

package com.lei.apitest.c02_transformation;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * @description: Demonstrates the map transformation: it operates on a DataStream and
 *   returns a new DataStream, producing one output element per input element.
 * @author: hadwinling
 * @time: 2021/3/29 3:53 PM
 */
public class C01_Map_TransformationDemo1 {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStreamSource<Integer> numbers = environment.fromElements(1, 2, 3, 4, 5);

        // Variant 1 (kept for reference only): an anonymous MapFunction.
        //
        //   SingleOutputStreamOperator<Integer> doubled = numbers.map(new MapFunction<Integer, Integer>() {
        //       @Override
        //       public Integer map(Integer value) throws Exception {
        //           return value * 2;
        //       }
        //   });

        // Variant 2: a lambda. The result type is declared explicitly via returns(...);
        // returns(Integer.class) is an equivalent way to declare it.
        SingleOutputStreamOperator<Integer> doubled = numbers
                .map(value -> value * 2)
                .returns(Types.INT);

        // Variant 3: a RichMapFunction, which additionally offers the open/close lifecycle
        // hooks (useful, for example, around database access).
        SingleOutputStreamOperator<Integer> timesTen = numbers.map(new RichMapFunction<Integer, Integer>() {
            @Override
            public Integer map(Integer value) throws Exception {
                return value * 10;
            }

            // Invoked once after construction and before the first map() call; the
            // Configuration gives access to global settings. Typical use: set up
            // connections, or initialize/restore state.
            @Override
            public void open(Configuration parameters) throws Exception {
                super.open(parameters);
            }

            // Invoked once before the function is disposed; typically used to release resources.
            @Override
            public void close() throws Exception {
                super.close();
            }
        });

        // Sinks
        doubled.print();
        // NOTE: this separator is written by main() itself while the job is being defined,
        // i.e. before execute() is called, so it does not sit between the two streams' output.
        System.out.println("==================================");
        timesTen.print();

        environment.execute();
    }
}

FlatMap算子:取一个元素并产生零个,一个或多个元素。

将DataStream类型 转化为 DataStream类型。

package com.lei.apitest.c02_transformation;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

import java.util.Arrays;

/**
 * @description: Demonstrates the flatMap transformation: every input line is split on
 *   single spaces and each resulting token is emitted as its own element.
 * @author: hadwinling
 * @time: 2021/3/29 4:01 PM
 */
public class C02_FlatMap_TransformationDemo1 {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStreamSource<String> sentences = environment.fromElements("spark flink hadoop", "spark flink hbase");

        // Variant 1: anonymous FlatMapFunction with an explicit loop.
        // (This stream is defined but has no sink attached below.)
        SingleOutputStreamOperator<String> words = sentences.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public void flatMap(String sentence, Collector<String> out) throws Exception {
                for (String token : sentence.split(" ")) {
                    out.collect(token);
                }
            }
        });

        // Variant 2: lambda with the concise stream/method-reference form; the output
        // type is declared explicitly via returns(...).
        SingleOutputStreamOperator<String> words2 = sentences
                .flatMap((String sentence, Collector<String> out) ->
                        Arrays.stream(sentence.split(" ")).forEach(out::collect))
                .returns(Types.STRING);

        // flatMap also accepts a RichFlatMapFunction.

        // Sink
        words2.print();

        environment.execute();
    }
}


Filter 算子:为每个元素评估一个布尔函数,并保留该函数返回true的元素。

将DataStream类型 转化为 DataStream类型。

package com.lei.apitest.c02_transformation;

import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

import java.util.Arrays;

/**
 * @description: Demonstrates the filter transformation: only elements for which the
 *   predicate returns true are kept.
 * @author: hadwinling
 * @time: 2021/3/29 4:01 PM
 */
public class C03_Filter_TransformationDemo1 {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStreamSource<Integer> numbers = environment.fromElements(1, 2, 3, 4, 5, 6, 7, 8, 9);

        // Anonymous FilterFunction: keeps the odd numbers.
        // (This stream is defined but has no sink attached below.)
        SingleOutputStreamOperator<Integer> oddNumbers = numbers.filter(new FilterFunction<Integer>() {
            @Override
            public boolean filter(Integer value) throws Exception {
                boolean even = value % 2 == 0;
                return !even;
            }
        });

        // Lambda form. The one-line equivalent would be: numbers.filter(n -> n >= 5)
        // A lambda with a more involved body needs braces and an explicit return:
        SingleOutputStreamOperator<Integer> fiveOrMore = numbers.filter(value -> {
            return 5 <= value;
        });

        // Sink
        fiveOrMore.print();

        environment.execute();
    }
}

keyBy算子:将DataStream类型 转化为 KeyedStream类型。逻辑上将流划分为不相交的分区。具有相同键的所有记录都分配给相同的分区。在内部,keyBy()是通过哈希分区实现的。有多种指定键(key)的方法。

将DataStream类型 转化为 KeyedStream类型。

注意:在以下情况下,类型不能作为键:

  1. 它是POJO类型,但不覆盖hashCode()方法,而是依赖于Object.hashCode()实现。
  2. 它是任何类型的数组。
package com.hadwinling.apitest.c02_transformation;

import com.hadwinling.apitest.beans.SensorReading;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * @description: Demonstrates keyBy: turns a DataStream of (word, 1) tuples into a
 *   KeyedStream keyed by the word, then prints it.
 * @author: hadwinling
 * @time: 2021/3/29 4:01 PM
 */
public class C05_KeyBy_Transformation_Demo1 {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Each line read from the socket is treated as a single word.
        DataStreamSource<String> words = env.socketTextStream("localhost", 7777);

        // Pair every word with the count 1; the tuple type is declared explicitly
        // because a lambda is used.
        SingleOutputStreamOperator<Tuple2<String, Integer>> wordAndOne = words
                .map(w -> Tuple2.of(w, 1))
                .returns(Types.TUPLE(Types.STRING, Types.INT));

        // Key by the word (tuple field f0, i.e. position 0) using a KeySelector.
        // This replaces the deprecated index-based keyBy(0) and makes the key type
        // a concrete String rather than the generic Tuple wrapper.
        KeyedStream<Tuple2<String, Integer>, String> keyed = wordAndOne.keyBy(t -> t.f0);

        keyed.print();

        env.execute();
    }
}

Reduce 算子:对键控数据流进行“滚动”压缩。

将KeyedStream类型 转化为 DataStream类型。

package com.hadwinling.apitest.c02_transformation;

import com.hadwinling.apitest.beans.SensorReading;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * @description: Word count over a socket text stream, implemented as a rolling
 *   reduce on a stream keyed by word.
 * @author: hadwinling
 * @time: 2021/3/28 4:57 PM
 */
public class C07_Reduce_Transformation_Demo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Each line read from the socket is treated as a single word.
        DataStreamSource<String> words = env.socketTextStream("localhost", 7777);

        // Pair every word with the count 1; the tuple type is declared explicitly
        // because a lambda is used.
        SingleOutputStreamOperator<Tuple2<String, Integer>> wordAndOne = words
                .map(w -> Tuple2.of(w, 1))
                .returns(Types.TUPLE(Types.STRING, Types.INT));

        // Key by the word (tuple field f0, i.e. position 0) using a KeySelector
        // instead of the deprecated index-based keyBy(0).
        KeyedStream<Tuple2<String, Integer>, String> keyed = wordAndOne.keyBy(t -> t.f0);

        // Word count: combine the previously reduced value with each new element.
        SingleOutputStreamOperator<Tuple2<String, Integer>> reduced = keyed.reduce(new ReduceFunction<Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> reduce(Tuple2<String, Integer> v1, Tuple2<String, Integer> v2) throws Exception {
                // Return a fresh tuple rather than mutating v1: the inputs may be objects
                // that the runtime still holds on to, so they are treated as read-only here.
                return Tuple2.of(v1.f0, v1.f1 + v2.f1);
            }
        });

        reduced.print();

        env.execute();
    }
}

聚合操作

将KeyedStream类型 转化为 DataStream类型。

1. sum 操作

package com.hadwinling.apitest.c02_transformation;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * @description: Demonstrates the sum aggregation on a KeyedStream of C5_WordCounts
 *   beans: the stream is keyed by word and the "counts" field is summed per key.
 * @author: hadwinling
 * @time: 2021/3/29 4:01 PM
 */
public class C05_KeyBy_Transformation_sum {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Each line read from the socket is treated as a single word.
        DataStreamSource<String> words = env.socketTextStream("localhost", 7777);

        // Wrap every word in a bean with an initial count of 1.
        SingleOutputStreamOperator<C5_WordCounts> wordAndOne = words.flatMap(new FlatMapFunction<String, C5_WordCounts>() {
            @Override
            public void flatMap(String s, Collector<C5_WordCounts> collector) throws Exception {
                collector.collect(new C5_WordCounts(s, 1L));
            }
        });

        // Key by the bean's word through its getter. This replaces the deprecated
        // string field-expression keyBy("word") and gives the key a concrete String type.
        KeyedStream<C5_WordCounts, String> keyedStream = wordAndOne.keyBy(C5_WordCounts::getWord);

        // Aggregate: rolling sum of the "counts" field per key.
        SingleOutputStreamOperator<C5_WordCounts> sum = keyedStream.sum("counts");

        // Print the result.
        sum.print();

        env.execute();
    }
}

C5_WordCounts

package com.hadwinling.apitest.c02_transformation;

/**
 * @description: Bean holding a word together with its count.
 *   When a constructor with arguments is provided, a no-argument constructor must be
 *   provided as well; otherwise reflection-based instantiation will run into problems later.
 * @author: hadwinling
 * @time: 2021/3/29 4:27 PM
 */
public class C5_WordCounts {
    private String word;
    private Long counts;

    /** No-argument constructor; leaves both fields unset (null). */
    public C5_WordCounts() {
    }

    /** Creates a bean with the given word and count. */
    public C5_WordCounts(String word, Long counts) {
        this.word = word;
        this.counts = counts;
    }

    // ---- accessors ----

    public String getWord() {
        return this.word;
    }

    public Long getCounts() {
        return this.counts;
    }

    // ---- mutators ----

    public void setWord(String word) {
        this.word = word;
    }

    public void setCounts(Long counts) {
        this.counts = counts;
    }

    /** Renders the bean as {@code C5_WordCounts{word='...', counts=...}}. */
    @Override
    public String toString() {
        return String.format("C5_WordCounts{word='%s', counts=%s}", word, counts);
    }
}

2. Max操作

package com.hadwinling.apitest.c02_transformation;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * @description: Demonstrates the max aggregation: for each key (word), emits the
 *   rolling maximum of the numeric field.
 * @author: hadwinling
 * @time: 2021/3/29 4:55 PM
 */
public class C08_Max_Transformation {
    public static void main(String[] args) throws Exception {
        // 1. Create the execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(4);

        // Expected input format, one record per line: word,number  (e.g. "spark,10")
        DataStreamSource<String> lines = env.socketTextStream("localhost", 7777);

        SingleOutputStreamOperator<Tuple2<String, Integer>> wordAndNum = lines.map(new MapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> map(String line) throws Exception {
                String[] fields = line.split(",");
                String word = fields[0];
                // Trim before parsing so that input such as "spark, 10" is accepted;
                // Integer.parseInt rejects surrounding whitespace.
                int num = Integer.parseInt(fields[1].trim());
                return Tuple2.of(word, num);
            }
        });

        // Key by the word (tuple field f0) using a KeySelector instead of the
        // deprecated index-based keyBy(0); the key type becomes a concrete String.
        KeyedStream<Tuple2<String, Integer>, String> keyed = wordAndNum.keyBy(t -> t.f0);

        // Rolling maximum over tuple position 1 (the number) for each key.
        SingleOutputStreamOperator<Tuple2<String, Integer>> res = keyed.max(1);

        res.print();

        env.execute();
    }
}

常见聚合操作:

1. sum()
2. min()
3. max()
4. minBy()
5. maxBy()

min和minBy之间的区别在于min返回最小值,而minBy返回在此字段中具有最小值的元素(与max和maxBy相同)。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值