Two ways to implement it: with native Spark RDD-style DStream operators, and with Spark SQL.
Native Spark RDD operator implementation
package com.zero.demo

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Spark Streaming windowed word count.
 */
object SparkStreamingDemo {

  /**
   * Scala (RDD operator) implementation.
   */
  def ScalaStreamingWC(): Unit = {
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("SparkStreamingDemo")
    // A StreamingContext can be built from a SparkConf directly, or from an existing SparkContext
    val ssc = new StreamingContext(conf, Seconds(20))
    // read lines from a TCP socket
    val lines = ssc.socketTextStream("localhost", 9998)
    // split each line into words and pair each word with a count of 1
    val words = lines.flatMap(_.split(" "))
    val wordAddOne = words.map((_, 1))
    // sum the counts over a 40-second window, sliding every 20 seconds
    val result = wordAddOne.reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(40), Seconds(20))
    result.print()
    ssc.start()
    ssc.awaitTermination()
  }

  def main(args: Array[String]): Unit = {
    ScalaStreamingWC()
  }
}
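When the window is long relative to the slide interval, reduceByKeyAndWindow also comes in an incremental form that takes an inverse reduce function: instead of re-summing the whole window on every slide, Spark adds the batch entering the window and subtracts the batch leaving it. A minimal sketch of that variant, replacing the `result` line in the example above (the checkpoint path is hypothetical; the inverse form requires checkpointing to be enabled):

    // Incremental window reduce: requires a checkpoint directory.
    ssc.checkpoint("/tmp/streaming-checkpoint") // hypothetical local path
    val result = wordAddOne.reduceByKeyAndWindow(
      (a: Int, b: Int) => a + b, // fold in batches entering the window
      (a: Int, b: Int) => a - b, // subtract batches leaving the window
      Seconds(40),               // window length
      Seconds(20)                // slide interval
    )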
Spark SQL implementation. Note that this version counts words per 10-second batch rather than over a sliding window: each micro-batch RDD is registered as a temporary view and queried with SQL.
package com.zero.demo

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Spark Streaming word count via Spark SQL.
 */
object SparkStreamingDemo {

  /**
   * SQL implementation.
   */
  def SqlStreamingWC(): Unit = {
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("SparkStreamingDemo")
    val spark = SparkSession
      .builder()
      .config(conf)
      .getOrCreate()
    // streaming context built on the SparkSession's SparkContext
    val ssc = new StreamingContext(spark.sparkContext, Seconds(10))
    // create a socket text stream
    val lines = ssc.socketTextStream("localhost", 9997)
    // flatten each line into words with flatMap
    val words = lines.flatMap(_.split(" "))
    // foreachRDD applies the body to every RDD in the DStream
    words.foreachRDD { rdd =>
      import spark.implicits._
      // turn the RDD of words into a single-column DataFrame
      val df = rdd.toDF("word")
      df.createOrReplaceTempView("_tmpView")
      spark.sql("select word, count(*) as cnt from _tmpView group by word").show(100, false)
    }
    ssc.start()
    ssc.awaitTermination()
  }

  def main(args: Array[String]): Unit = {
    SqlStreamingWC()
  }
}
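Both examples need spark-core, spark-streaming, and spark-sql on the classpath. A sketch of the sbt dependencies (the versions here are assumptions; match them to your cluster's Spark and Scala versions):

    // build.sbt (illustrative versions)
    scalaVersion := "2.11.12"

    val sparkVersion = "2.4.8"
    libraryDependencies ++= Seq(
      "org.apache.spark" %% "spark-core"      % sparkVersion,
      "org.apache.spark" %% "spark-sql"       % sparkVersion,
      "org.apache.spark" %% "spark-streaming" % sparkVersion
    )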
Before running, install netcat.
Download: https://eternallybored.org/misc/netcat/
Installation guide: https://www.cnblogs.com/kukudetent/p/11696500.html
On Windows, start a listener with nc -l -p <port>, for example:
nc -l -p 9997
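Start the listener first, then start the Spark job, and type words into the netcat terminal:

    nc -l -p 9998
    hello world hello

With input like the above, the RDD example should print pairs such as (hello,2) and (world,1) in its next window output, while the SQL example shows a word/cnt table for each batch (exact timestamps and row ordering will vary from run to run).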