Spark Streaming word count

Import the pom dependencies

  <properties>
    <scala.version>2.12.10</scala.version>
    <spark.version>3.0.1</spark.version>
    <spark.scala.version>2.12</spark.scala.version>
    <kafka.version>2.0.0</kafka.version>
  </properties>

  <repositories>
    <repository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </repository>
  </repositories>

  <pluginRepositories>
    <pluginRepository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </pluginRepository>
  </pluginRepositories>

  <dependencies>
    <!-- log -->
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.17</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-log4j12</artifactId>
      <version>1.6.1</version>
    </dependency>

    <!-- scala -->
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>

    <!-- spark core -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_${spark.scala.version}</artifactId>
      <version>${spark.version}</version>
    </dependency>

    <!-- utility and client libraries -->
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-lang3</artifactId>
      <version>3.9</version>
    </dependency>
    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
      <version>2.4</version>
    </dependency>
    <dependency>
      <groupId>org.jeecgframework.nacos</groupId>
      <artifactId>nacos-client</artifactId>
      <version>1.4.1</version>
    </dependency>
    <!-- If Spark is to integrate with Hive, the Hive support dependency must be added -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-hive_2.12</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <!-- MySQL JDBC driver -->
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.48</version>
    </dependency>

    <!-- Spark SQL -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.12</artifactId>
      <version>${spark.version}</version>
    </dependency>


    <!-- Spark Streaming -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.12</artifactId>
      <version>${spark.version}</version>
    </dependency>

    <!-- Spark Streaming + Kafka integration -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka-0-10_2.12</artifactId>
      <version>${spark.version}</version>
    </dependency>

    <!--  spark graph  -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-graphx_2.12</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <!-- web socket  -->
    <dependency>
      <groupId>org.eclipse.jetty.websocket</groupId>
      <artifactId>websocket-server</artifactId>
      <version>9.4.35.v20201120</version>
    </dependency>

    <!-- kafka -->
    <dependency>
      <groupId>org.apache.kafka</groupId>
      <artifactId>kafka_2.12</artifactId>
      <version>${kafka.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.kafka</groupId>
      <artifactId>kafka-clients</artifactId>
      <version>${kafka.version}</version>
    </dependency>
  </dependencies>

Main program

package cn.kgc.demo03

//Only accumulates results produced after the job starts; data is lost on restart and offsets are not recorded
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.{Milliseconds, StreamingContext}
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

object stateStreamingWordCount {
/*
updateFunc parameters:
1st: the aggregation key, i.e. the word
2nd: the counts of this word in each partition of the current batch (the locally aggregated results)
3rd: the initial value, or the accumulated intermediate result from previous batches
 */

  val updateFunc = (it: Iterator[(String, Seq[Int], Option[Int])]) => {
    it.map(t => (t._1, t._2.sum + t._3.getOrElse(0)))
  }
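
  /*
  Worked example (hypothetical sample values, not from the original post): the iterator
  passed to updateFunc holds one tuple per key. If in the current batch the word "spark"
  appears 2 times in partition 0 and 1 time in partition 1, and its previously saved state
  is 5, the tuple is ("spark", Seq(2, 1), Some(5)) and updateFunc emits ("spark", 2 + 1 + 5) = ("spark", 8).
  A key seen for the first time arrives with None, so getOrElse(0) starts it at 0.
  */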


  def main(args: Array[String]): Unit = {
    //A batch job creates a SparkContext; here we build a StreamingContext for real-time computation

    val conf = new SparkConf().setAppName("Streaming_wordCount").setMaster("local[*]")
  
    val ssc = new StreamingContext(conf,Milliseconds(5000))

    //To update historical data (i.e. accumulate), the intermediate state must be saved, so set a checkpoint directory
    ssc.checkpoint("hdfs://192.168.6.160:9820//user/data/strem")

    //With a StreamingContext we can create Spark Streaming's abstraction, the DStream
    //On the virtual machine run `nc -lk 8888`; lines typed there after pressing Enter will be read by this stream
    val lines: ReceiverInputDStream[String] = ssc.socketTextStream("192.168.6.160", 8888)


    //Operating on the DStream (a proxy/description of the stream) feels like operating on a collection
    //Method 1: reduceByKey; after a restart, only newly received data is counted (per-batch counts)
    lines.flatMap(_.split(" "))
      .map(x => (x, 1))     //pair each word with 1
      .reduceByKey(_ + _)   //aggregate within the batch
      .print()

    //Method 2: updateStateByKey, which accumulates counts on top of earlier batches
    lines.flatMap(_.split(" "))
      .map(x => (x, 1))     //pair each word with 1
      //arguments: update function, partitioner, whether to keep using this partitioner for subsequent batches
      .updateStateByKey(updateFunc, new HashPartitioner(ssc.sparkContext.defaultParallelism), true)
      .print()              //triggers the action
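
    /*
    Hypothetical illustration (assumed input, not from the original post): if "hello spark"
    is typed into nc in two consecutive batches, method 1 prints (hello,1) (spark,1) for each
    batch, while method 2 prints (hello,1) (spark,1) after the first batch and (hello,2) (spark,2)
    after the second, because the per-key state is carried forward via the checkpoint.
    */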


    //start the Spark Streaming job
    ssc.start()


    //block until the streaming job is terminated
    ssc.awaitTermination()
  }
}
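
The comment at the top of the program notes that this version only accumulates state from the moment the job starts and loses it when the driver restarts. One common way to make the accumulated state survive a restart is to build the context with StreamingContext.getOrCreate, which restores the StreamingContext (including the updateStateByKey state) from the checkpoint directory when one exists. The following is a minimal sketch, not part of the original post: the object name and the createContext helper are illustrative, the checkpoint path and socket address are the same as above, and it uses the simpler per-key overload of updateStateByKey.

package cn.kgc.demo03

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Milliseconds, StreamingContext}

object recoverableStreamingWordCount {

  //same checkpoint directory as in the program above
  val checkpointDir = "hdfs://192.168.6.160:9820//user/data/strem"

  //builds a fresh context; only called when no usable checkpoint exists yet
  def createContext(): StreamingContext = {
    val conf = new SparkConf().setAppName("Recoverable_wordCount").setMaster("local[*]")
    val ssc = new StreamingContext(conf, Milliseconds(5000))
    ssc.checkpoint(checkpointDir)

    ssc.socketTextStream("192.168.6.160", 8888)
      .flatMap(_.split(" "))
      .map(word => (word, 1))
      //per-key overload: new counts from this batch plus the previous state
      .updateStateByKey[Int]((newCounts: Seq[Int], state: Option[Int]) =>
        Some(newCounts.sum + state.getOrElse(0)))
      .print()

    ssc
  }

  def main(args: Array[String]): Unit = {
    //on a clean start this calls createContext(); after a crash it rebuilds the
    //context, and the accumulated word counts, from the checkpoint directory
    val ssc = StreamingContext.getOrCreate(checkpointDir, createContext _)
    ssc.start()
    ssc.awaitTermination()
  }
}

Note that this only addresses recovering the accumulated counts; it still does not track Kafka offsets, which is handled separately by the spark-streaming-kafka-0-10 integration listed in the pom.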