通过netcat产生实时数据源
安装netcat:yum install nmap-ncat.x86_64
设置端口:nc -lk 1234
spark streaming的scala代码
pom文件中需要添加的依赖:
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.4.4</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.4.4</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.4.4</version>
</dependency>
设置微批处理的时间为5秒:
/** Word count over a netcat socket stream, aggregated per 5-second micro-batch. */
object NcWordCountTest {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("wc")
    // Micro-batch interval: one RDD every 5 seconds.
    val streamingContext = new StreamingContext(sparkConf, Seconds(5))
    // Text lines received on the socket during each batch interval.
    val lines = streamingContext.socketTextStream("192.168.181.132", 1234)
    val counts = lines
      .flatMap(_.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
    // Counts are per batch only; no state is carried across batches.
    counts.print()
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}
运行scala代码,然后在Linux的窗口下输入文字,上面的代码只能实现五秒内的数据进行聚合统计,不能进行全局的统计
全局统计的代码优化(核心是checkpoint和updateStateByKey):
在D盘新建一个文件夹ck用来存放每一个RDD之前的聚合结果
/**
 * Globally-aggregated word count: combines checkpointing with updateStateByKey
 * so per-key running totals survive across micro-batches.
 */
object NcWordCountTest {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("wc")
    val ssc = new StreamingContext(conf, Seconds(5))
    // updateStateByKey requires a checkpoint directory to persist state between batches.
    ssc.checkpoint("d:/ck")
    val line = ssc.socketTextStream("192.168.181.132", 1234)
    line.flatMap(_.split(" "))
      .map((_, 1))
      // currVals: this key's values within the current batch (Seq of 1s);
      // prevState: the accumulated total from all previous batches (None the
      // first time a key is seen).
      .updateStateByKey((currVals: Seq[Int], prevState: Option[Int]) =>
        Some(currVals.sum + prevState.getOrElse(0))
      )
      // Fix: the original chained .reduceByKey(_ + _) here, which is redundant —
      // updateStateByKey already emits exactly one (key, total) pair per key.
      .print()
    ssc.start()
    ssc.awaitTermination()
  }
}
【添加窗口】
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * Sliding-window word count (9-second window, sliding every 3 seconds) that
 * also prints the top-3 most frequent words of each window.
 */
object MyWindow {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("wn").setMaster("local[*]")
    val ssc = new StreamingContext(sparkConf, Seconds(3))
    ssc.checkpoint("d:/ck")
    val socketStream = ssc.socketTextStream("192.168.181.132", 9000)
    val words = socketStream.flatMap(_.split(" "))
    // reduceByKey applied over a sliding window: 9s window length, 3s slide.
    val windowedCounts = words
      .map((_, 1))
      .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(9), Seconds(3))
    windowedCounts.transform { rdd =>
      // Swap to (count, word), sort descending, then print the top 3 words.
      val byCount = rdd.map { case (word, cnt) => (cnt, word) }
      byCount
        .sortByKey(false)
        .take(3)
        .map { case (cnt, word) => (word, cnt) }
        .foreach(println)
      rdd // pass the windowed counts through unchanged for print()
    }.print()
    ssc.start()
    ssc.awaitTermination()
  }
}