Flink 之 WordCount


一、新建maven项目并引入依赖

<!--java版依赖-->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-java</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-clients_${scala.version}</artifactId>
    <version>${flink.version}</version>
</dependency>

<!--scala版依赖-->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-scala_${scala.version}</artifactId>
    <version>${flink.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-scala_${scala.version}</artifactId>
    <version>${flink.version}</version>
</dependency>

<!--Flink默认使用的是slf4j记录日志,使用log4j作为具体的日志实现-->
<dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-api</artifactId>
    <version>1.7.25</version>
</dependency>
<dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-log4j12</artifactId>
    <version>1.7.25</version>
</dependency>
<dependency>
    <groupId>org.apache.logging.log4j</groupId>
    <artifactId>log4j-to-slf4j</artifactId>
    <version>2.13.2</version>
</dependency>

二、Java 版

1、Batch批处理

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.AggregateOperator;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.operators.FlatMapOperator;
import org.apache.flink.api.java.operators.UnsortedGrouping;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

public class BatchWordCount {
    public static void main(String[] args) throws Exception {
        // 1.创建执行环境
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // 2.从文件中读取数据
        DataSource<String> inputDS = env.readTextFile("input/input.txt");
        // 3.将数据转为2元组:(word, 1)
//        FlatMapOperator<String, Tuple2<String, Long>> wordAndOne = inputDS.flatMap(new FlatMapFunction<String, Tuple2<String, Long>>() {
//            @Override
//            public void flatMap(String s, Collector<Tuple2<String, Long>> collector) throws Exception {
//                String[] words = s.split(" ");
//                for (String word : words) {
//                    collector.collect(new Tuple2<String, Long>(word, 1L));
//                }
//            }
//        });
        // 3.将数据转为2元组:(word, 1)
        FlatMapOperator<String, Tuple2<String, Long>> wordAndOne = inputDS.flatMap((String line, Collector<Tuple2<String, Long>> out) -> {
            String[] words = line.split(" ");
            for (String word : words) {
                out.collect(Tuple2.of(word, 1L));
            }
        }).returns(Types.TUPLE(Types.STRING, Types.LONG)); //当Lambda表达式使用Java泛型的时候, 由于泛型擦除的存在, 需要显示的声明类型信息;
        // 4.按照2元组的word进行group
        UnsortedGrouping<Tuple2<String, Long>> wordCountUG = wordAndOne.groupBy(0);
        // 5.分组内进行聚合统计
        AggregateOperator<Tuple2<String, Long>> wordCount = wordCountUG.sum(1);
        // 6.打印输出
        wordCount.print();
    }
}

2、DataStream流处理

import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
import java.util.Arrays;

public class DataStreamWordCount {
    /**
     * Streaming word count with Flink's DataStream API: reads input/input.txt
     * as a bounded stream and prints a running (word, count) update per record.
     */
    public static void main(String[] args) throws Exception {
        // 1. Obtain the streaming execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // A single parallel task keeps the printed output in one ordered stream.
        env.setParallelism(1);
        // 2. Read the source file as a stream of lines.
        DataStreamSource<String> lines = env.readTextFile("input/input.txt");
        // 3a. Tokenize each line on single spaces. Lambdas lose generic type
        //     information to Java type erasure, hence the returns(...) hints.
        SingleOutputStreamOperator<String> words = lines
                .flatMap((String line, Collector<String> out) -> {
                    for (String word : line.split(" ")) {
                        out.collect(word);
                    }
                })
                .returns(Types.STRING);
        // 3b. Pair every word with an initial count of 1.
        SingleOutputStreamOperator<Tuple2<String, Long>> wordAndOne = words
                .map(word -> Tuple2.of(word, 1L))
                .returns(Types.TUPLE(Types.STRING, Types.LONG));
        // 4. Partition the stream by the word itself (tuple field f0).
        KeyedStream<Tuple2<String, Long>, String> keyed = wordAndOne.keyBy(pair -> pair.f0);
        // 5. Maintain a running sum of the counters (positional field 1) per key.
        SingleOutputStreamOperator<Tuple2<String, Long>> wordCount = keyed.sum(1);
        // 6. Print each updated (word, count) record.
        wordCount.print();
        // 7. Streaming jobs are lazy: nothing runs until execute() is called.
        env.execute();
    }
}

三、Scala 版

1、Batch批处理

import org.apache.flink.api.scala._
object BatchWordCount {
  /** Batch word count: read input/input.txt, split lines on single spaces,
    * and print every (word, count) pair. */
  def main(args: Array[String]): Unit = {
    // 1. Obtain the batch execution environment.
    val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    // 2. Load the input file as a DataSet of lines.
    val lines: DataSet[String] = env.readTextFile("input/input.txt")
    // 3. Emit one (word, 1) pair per space-separated token.
    val pairs: DataSet[(String, Int)] = lines
      .flatMap(_.split(" "))
      .map(word => (word, 1))
    // 4. Group on the word (positional field 0).
    val grouped: GroupedDataSet[(String, Int)] = pairs.groupBy(0)
    // 5. Sum the per-word counters (positional field 1) inside each group.
    val counts: AggregateDataSet[(String, Int)] = grouped.sum(1)
    // 6. print() triggers the job and writes the result to stdout.
    counts.print()
  }
}

2、DataStream流处理

import org.apache.flink.streaming.api.scala._

object DataStreamWordCount {
  /** Streaming word count over input/input.txt; emits a running
    * (word, count) update for every record processed. */
  def main(args: Array[String]): Unit = {
    // 1. Obtain the streaming execution environment.
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // A single parallel task keeps the printed output in one ordered stream.
    env.setParallelism(1)
    // 2. Read the file as a stream of lines.
    val lines: DataStream[String] = env.readTextFile("input/input.txt")
    // 3. Emit one (word, 1) pair per space-separated token.
    val pairs: DataStream[(String, Int)] = lines
      .flatMap(_.split(" "))
      .map(word => (word, 1))
    // 4. Key the stream by the word itself.
    val keyed: KeyedStream[(String, Int), String] = pairs.keyBy(_._1)
    // 5. Maintain a running sum of the counters (tuple position 1) per key.
    val counts = keyed.sum(1)
    // 6. Print each updated (word, count) record.
    counts.print()
    // 7. Streaming jobs are lazy: nothing runs until execute() is called.
    env.execute()
  }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值