Flink学习笔记(1)——流批处理


学习flink怎么能少得了wordcount呢?

pom文件

flink1.10版本及以下

<!-- Maven settings for Flink 1.10.x and earlier. -->
<properties>
    <flink.version>1.10.0</flink.version>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
</properties>

<dependencies>
    <!-- Batch (DataSet) API -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-java</artifactId>
        <version>${flink.version}</version>
    </dependency>

    <!-- Streaming (DataStream) API; the suffix is the Scala binary version -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-java_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>
</dependencies>

flink1.11版本及以上,需要加上flink-clients,不然会报错No ExecutorFactory found to execute the application

<!-- Maven settings for Flink 1.11 and later. -->
<properties>
    <flink.version>1.12.1</flink.version>
    <scala.binary.version>2.12</scala.binary.version>
    <maven.compiler.source>8</maven.compiler.source>
    <maven.compiler.target>8</maven.compiler.target>
</properties>

<dependencies>
    <!-- Batch (DataSet) API -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-java</artifactId>
        <version>${flink.version}</version>
    </dependency>

    <!-- Streaming (DataStream) API -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>

    <!-- Must be declared explicitly from 1.11 on; without it the job fails with
         "No ExecutorFactory found to execute the application". -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-clients_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
</dependencies>

批处理

package wc;


import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

/**
 * Batch word count: reads {@code hello.txt} from the classpath, splits every
 * line into whitespace-separated words and prints the total count per word.
 *
 * <p>Created 2021/04/27 16:32.
 *
 * @author yingtian
 */
public class WordCountSet {

    /** Name of the classpath resource that holds the input text. */
    private static final String INPUT_RESOURCE = "hello.txt";

    /**
     * Builds and runs the batch job.
     *
     * @param args unused
     * @throws Exception if the input resource cannot be resolved or the job fails
     */
    public static void main(String[] args) throws Exception {
        // Create the batch execution environment.
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // Resolve the input file. Fail with a clear message instead of a bare
        // NullPointerException when the resource is missing, and go through the
        // URI so that URL-encoded characters (e.g. %20 for a space in a parent
        // directory) are decoded into a real file-system path.
        java.net.URL resource = WordCountSet.class.getClassLoader().getResource(INPUT_RESOURCE);
        if (resource == null) {
            throw new IllegalStateException("Classpath resource not found: " + INPUT_RESOURCE);
        }
        String inputPath = java.nio.file.Paths.get(resource.toURI()).toString();
        DataSet<String> lines = env.readTextFile(inputPath);

        // Expand each line into (word, 1) tuples, then aggregate per word.
        DataSet<Tuple2<String, Integer>> resultSet = lines.flatMap(new MyFlatMapper())
                .groupBy(0) // group by the word (tuple field 0)
                .sum(1);    // sum the counts (tuple field 1)

        // In the DataSet API print() also triggers execution of the job,
        // so no explicit env.execute() call is needed here.
        resultSet.print();
    }

    /** Splits a line into words and emits one {@code (word, 1)} tuple per word. */
    public static class MyFlatMapper implements FlatMapFunction<String, Tuple2<String, Integer>> {

        /**
         * @param line      one line of input text
         * @param collector receives a {@code (word, 1)} tuple for every non-empty word
         */
        @Override
        public void flatMap(String line, Collector<Tuple2<String, Integer>> collector) throws Exception {
            // Split on any run of whitespace and skip empty tokens so that blank
            // lines or repeated spaces do not produce a bogus ("", 1) record.
            for (String word : line.split("\\s+")) {
                if (!word.isEmpty()) {
                    collector.collect(new Tuple2<>(word, 1));
                }
            }
        }
    }
}

输出结果:

# 一次输出
(are,2)
(you,2)
(flink,2)
(nice,1)
(world,2)
(good,1)
(hello,4)
(how,2)

流处理

package wc;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * Streaming word count: reads {@code hello.txt} from the classpath, splits
 * every line into whitespace-separated words and prints a running count per
 * word as records flow through the pipeline.
 *
 * <p>Created 2021/04/27 17:48.
 *
 * @author yingtian
 */
public class WordCountStream {

    /** Name of the classpath resource that holds the input text. */
    private static final String INPUT_RESOURCE = "hello.txt";

    /**
     * Builds and runs the streaming job.
     *
     * @param args unused
     * @throws Exception if the input resource cannot be resolved or the job fails
     */
    public static void main(String[] args) throws Exception {
        // Create the streaming execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Resolve the input file through this class's own class loader. Fail with
        // a clear message when the resource is missing, and go through the URI so
        // that URL-encoded characters (e.g. %20) become a real file-system path.
        java.net.URL resource = WordCountStream.class.getClassLoader().getResource(INPUT_RESOURCE);
        if (resource == null) {
            throw new IllegalStateException("Classpath resource not found: " + INPUT_RESOURCE);
        }
        String inputPath = java.nio.file.Paths.get(resource.toURI()).toString();
        DataStream<String> lines = env.readTextFile(inputPath);

        // Expand each line into (word, 1) tuples, then keep a running sum per word.
        // A key selector is used because the index-based keyBy(0) is deprecated.
        DataStream<Tuple2<String, Integer>> resultStream = lines.flatMap(new MyFlatMapper())
                .keyBy(item -> item.f0) // key by the word (tuple field f0)
                .sum(1);                // sum the counts (tuple field 1)

        resultStream.print();

        // Unlike the batch example, the streaming job only starts on execute().
        env.execute();
    }

    /** Splits a line into words and emits one {@code (word, 1)} tuple per word. */
    public static class MyFlatMapper implements FlatMapFunction<String, Tuple2<String, Integer>> {

        /**
         * @param line      one line of input text
         * @param collector receives a {@code (word, 1)} tuple for every non-empty word
         */
        @Override
        public void flatMap(String line, Collector<Tuple2<String, Integer>> collector) throws Exception {
            // Split on any run of whitespace and skip empty tokens so that blank
            // lines or repeated spaces do not produce a bogus ("", 1) record.
            for (String word : line.split("\\s+")) {
                if (!word.isEmpty()) {
                    collector.collect(new Tuple2<>(word, 1));
                }
            }
        }
    }
}

输出结果:

3> (world,1)
2> (hello,1)
4> (flink,1)
4> (flink,2)
3> (how,1)
3> (you,1)
3> (how,2)
3> (you,2)
3> (nice,1)
3> (good,1)
3> (world,2)
2> (hello,2)
2> (are,1)
2> (are,2)
2> (hello,3)
2> (hello,4)

总结

  1. 批处理执行环境为ExecutionEnvironment,流处理执行环境为StreamExecutionEnvironment
  2. 批处理=>几组或所有数据到达后才处理;流处理=>有数据来就直接处理,不等数据堆叠到一定数量级
  3. 批处理分组使用groupBy,流处理分组使用keyBy
  4. 1.12版本源码中,按下标位置分组的keyBy方法已标记为过期,推荐使用键选择器(KeySelector)
    在这里插入图片描述
  5. 并行度:开发环境的并行度默认就是计算机的CPU逻辑核数

ps:以上内容整理于SGG教程。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值