Flink DataStream Streaming and Batch Word Count

 

Example 1: Generating word data through a socket

Scenario: every 1 second, aggregate the data received in the last 2 seconds (a sliding window of size 2s that slides every 1s, so consecutive windows overlap by one second).

package org.jy.data.yh.bigdata.streaming.flink;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;
/**
 * Streaming word count
 * Sliding-window computation
 * Words are produced through a socket to simulate a data source,
 * and Flink aggregates them.
 * Requirement: every 1 second, aggregate the data from the last 2 seconds.
 * Created by yanghong
 * Run [hadoop@centoshadoop1 ~]$ nc -l 9000 first; otherwise the job cannot connect and exits with an error.
 */
public class SocketWindowWordCountJava {
    public static void main( String[] args ) throws Exception {
        // Get the port to listen on
        int port;
        try{
            ParameterTool parameterTool = ParameterTool.fromArgs(args);
            port = parameterTool.getInt("port");
        }catch (Exception e){
            System.err.println("No port set. use default port 9000--java");
            port = 9000;
        }
        // Get the Flink execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        String hostname = "centoshadoop1";
        // Delimiter that separates records in the socket stream
        String delimiter = "\n";
        // Connect to the socket to read the input data
        DataStreamSource<String> text = env.socketTextStream(hostname, port, delimiter);

        DataStream<WordWithCount> windowCounts = text.flatMap(new FlatMapFunction<String, WordWithCount>() {
            @Override
            public void flatMap(String value, Collector<WordWithCount> collector) throws Exception {
                String[] splits = value.split("\\s");
                for(String word : splits){
                    collector.collect(new WordWithCount(word,1L));
                }
            }
        }).keyBy("word")
                .timeWindow(Time.seconds(2),Time.seconds(1)) // window size 2 seconds, sliding interval 1 second
                .sum("count"); // either sum or reduce works here
        /*.reduce(new ReduceFunction<WordWithCount>() {
                                    public WordWithCount reduce(WordWithCount a, WordWithCount b) throws Exception {
                                        return new WordWithCount(a.word,a.count+b.count);
                                    }
                                })*/
        // Print the result to the console and set the parallelism
        windowCounts.print().setParallelism(1);

        // This line is required; without it the program does not run
        env.execute("Socket window count");
    }
    public static class WordWithCount{
        public String word;
        public long count;
        public  WordWithCount(){}
        public WordWithCount(String word,long count){
            this.word = word;
            this.count = count;
        }
        @Override
        public String toString() {
            return "WordWithCount{" +
                    "word='" + word + '\'' +
                    ", count=" + count +
                    '}';
        }
    }
}
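
To test the job, start the socket source before launching the program, then type words into the nc session. The input below is illustrative:

[hadoop@centoshadoop1 ~]$ nc -l 9000
hello world
hello flink

Because print() uses WordWithCount.toString(), the console output looks like the following (the exact counts depend on which windows the words fall into):

WordWithCount{word='hello', count=2}
WordWithCount{word='world', count=1}
WordWithCount{word='flink', count=1}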

Scala version

package com.yh.bigdata.flink.streaming

import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.windowing.time.Time

/**
  * Streaming word count
  * Requirement: every 1 second, aggregate the data from the last 2 seconds.
  * Approach: words are produced through a socket, and the Flink job aggregates them.
  */
object SocketWindowWordCount {
    def main(args: Array[String]): Unit = {
        // Get the socket port number (default 9000)
        val port = try {
            ParameterTool.fromArgs(args).getInt("port")
        } catch {
            case _: Exception =>
                System.err.println("No port set, using default port 9000 (Scala)")
                9000 // default value
        }
        // Get the execution environment
        val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
        // Connect to the socket to read the input data
        val text = env.socketTextStream("centoshadoop1",port,'\n')
        // Note: this implicit-conversion import is required, otherwise flatMap below will not compile
        import org.apache.flink.api.scala._

        // Parse the data (flatten it), group by word, apply the window, and sum
        val windowWordCount = text.flatMap(line => line.split("\\s")) // flatten: split every line into words
          .map(word => WordWithCount(word,1)) // map each word to (word, 1)
          .keyBy("word") // group by word
          .timeWindow(Time.seconds(2),Time.seconds(1)) // window size 2 seconds, sliding interval 1 second
          .sum("count") // either sum or reduce works here
        // .reduce((a, b) => WordWithCount(a.word, a.count + b.count))
        // Print to the console
        windowWordCount.print().setParallelism(1)
        // Run the job
        env.execute("Socket window count")
    }

    case class WordWithCount(word: String, count: Long)
}
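
Either version can be submitted to a cluster with the flink CLI once the job is packaged into a jar. A minimal sketch, where the jar path is illustrative and the main class is the one defined above:

flink run -c com.yh.bigdata.flink.streaming.SocketWindowWordCount /path/to/flink-wordcount.jar --port 9000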

================================================================================

Example 2: Batch (offline) word count

Requirement: count the total number of occurrences of each word in a file and write the result to a text file.

package org.jy.data.yh.bigdata.batch.flink;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

/**
 * Batch (offline) word count
 * Requirement: count the total number of occurrences of each word in a file
 * and write the result to a text file.
 * The result file contains output such as:
 * hong 939
 * hello 2343
 * world 1404
 * hadoopyang 463
 * yang 476
 * hadoop 1416
 * bye 1903
 */
public class BatchWordCountJava {
    public static void main(String[] args) throws Exception {
       String inputPath = "D:/jar/inputPath";  // Input directory; create a text file with the content to count under it
       String outPath = "D:/jar/result";  // Output file under D:/jar; it does not need to exist beforehand
        // Get the execution environment
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // Read the contents of the input file(s)
        DataSource<String> text = env.readTextFile(inputPath);

        DataSet<Tuple2<String,Integer>> counts = text.flatMap(new Tokenizer())
                .groupBy(0).sum(1);
        counts.writeAsCsv(outPath,"\n"," ").setParallelism(1);
        env.execute("batch word count");
    }

    private static class Tokenizer implements FlatMapFunction<String,Tuple2<String,Integer>>{  // Tokenizer: splits each line into words
        @Override
        public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception {
               String[] tokens = value.toLowerCase().split("\\W+");
               for(String token : tokens){
                   if(token.length() >0 ){
                       out.collect(new Tuple2<String,Integer>(token,1));
                   }
               }
        }
    }
}
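
Before running, the input directory must exist and contain at least one text file of whitespace-separated words. An illustrative input file, using the words from the expected output above:

hello world hadoop bye
yang hadoopyang hong hello hadoop

Since the sink is writeAsCsv(outPath, "\n", " "), each Tuple2 becomes one line with the word and its count separated by a space, which matches the sample output in the Javadoc.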

Scala version

package com.yh.bigdata.flink.batch

import org.apache.flink.api.scala.ExecutionEnvironment

/**
  * Batch word count
  */
object BatchWordCountScala {
  def main(args: Array[String]): Unit = {
    val inputPath="d:/jar/inputPath"
    val outPath = "d:/jar/result"
    val env = ExecutionEnvironment.getExecutionEnvironment  // note: getExecutionEnvironment is a method call

    val text = env.readTextFile(inputPath)

    // Import the implicit conversions
    import org.apache.flink.api.scala._
    val counts = text.flatMap(_.toLowerCase().split("\\W+")) // lower-case, for consistency with the Java version
      .filter(_.nonEmpty)
      .map((_,1))
      .groupBy(0)
      .sum(1)
    counts.writeAsCsv(outPath,"\n"," ")
    env.execute("batch word count")
  }

}

 
