Spark Streaming WordCount Example (Cumulative Count) (Part 2)
I. Overview
Use the netcat tool to continuously send data to port 9999. Spark Streaming reads the data from that port and maintains a cumulative count of how many times each word has appeared.
II. Using netcat
1. Install netcat on the virtual machine
[root@hadoop1 spark]# yum install -y nc
2. Start the listener and send data
[root@hadoop1 spark]# nc -lk 9999
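Once nc is listening, every line you type is pushed to the connected Spark Streaming receiver. A sample session might look like the following (the input lines "hello world" and "hello spark" are purely illustrative):
[root@hadoop1 spark]# nc -lk 9999
hello world
hello spark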
III. Code Implementation
1. Maven dependency
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>2.1.1</version>
</dependency>
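If you build with sbt instead of Maven, a sketch of the equivalent dependency (same artifact and version as above, assuming your project's scalaVersion is a 2.11.x release so that %% resolves to the _2.11 artifact) would be:
// build.sbt (illustrative), assuming scalaVersion := "2.11.8"
libraryDependencies += "org.apache.spark" %% "spark-streaming" % "2.1.1"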
2. Scala code (the original heading said "Java", but the code below is Scala)
package com.it.sparkStreaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}

object SparkStreamingState {
  def main(args: Array[String]): Unit = {
    // Create the StreamingContext with a 5-second batch interval
    val sparkConf = new SparkConf().setAppName("SparkStreamingState").setMaster("local[*]")
    val ssc: StreamingContext = new StreamingContext(sparkConf, Seconds(5))
    // Set a checkpoint directory; updateStateByKey requires one to persist state across batches
    ssc.checkpoint("cp")
    // Create the ReceiverInputDStream that reads lines from the socket
    val receiver: ReceiverInputDStream[String] = ssc.socketTextStream("hadoop1", 9999)
    // Split each line into words and pair every word with a count of 1
    val wordOne: DStream[(String, Int)] = receiver.flatMap(_.split(" ")).map((_, 1))
    // Sum the counts within the current batch: reduceByKey
    val wordCount: DStream[(String, Int)] = wordOne.reduceByKey(_ + _)
    // Fold each batch's sums into the running total: updateStateByKey
    val result: DStream[(String, Int)] = wordCount.updateStateByKey((batchSums: Seq[Int], state: Option[Int]) => {
      val current = batchSums.sum
      val previous = state.getOrElse(0)
      Some(current + previous)
    })
    result.print()
    // Start the StreamingContext and block until the job is stopped
    ssc.start()
    ssc.awaitTermination()
  }
}
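With the 5-second batch interval above, typing the sample lines from section II into the nc session should produce output along these lines (the timestamps are illustrative and will differ on your machine; note how updateStateByKey carries the counts over from batch to batch instead of resetting them):
-------------------------------------------
Time: 1621234565000 ms
-------------------------------------------
(hello,1)
(world,1)

-------------------------------------------
Time: 1621234570000 ms
-------------------------------------------
(hello,2)
(world,1)
(spark,1)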