1. 先启动nc,监听9999端口
这里我在 Windows 系统上使用 nc 开启服务端,命令如下:
nc -lp 9999
2. 编写Spark Streaming代码
/**
 * Spark Streaming demo that counts words read from a TCP text socket in two ways:
 * a sliding-window count computed on the DStream itself, and a per-batch count
 * computed by converting each batch RDD to a DataFrame and querying it with SQL.
 */
object DF_SQL_InStreaming {

  /**
   * Builds the streaming job, starts it, and blocks until the streaming context terminates.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Run locally with two threads and a 5-second batch interval.
    val conf = new SparkConf().setMaster("local[2]").setAppName("NetworkWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))

    // Read lines of text from the socket server (e.g. one started with `nc -lp 9999`).
    val lines = ssc.socketTextStream("10.10.12.177", 9999)

    // Transformations on DStreams: split each line into words. Blank lines and
    // repeated spaces produce empty tokens, which are dropped so they are not counted.
    val words = lines.flatMap(_.split(" ")).filter(_.nonEmpty)

    // Approach 1: aggregate with Spark Streaming window operations.
    // Counts cover the last 60 seconds and are recomputed every 5 seconds.
    val pairs = words.map(word => (word, 1))
    val windowedWordCounts =
      pairs.reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(60), Seconds(5))
    // A DStream is only computed when an output operation is registered on it;
    // without this call the windowed counts would never be evaluated or shown.
    windowedWordCounts.print()

    // Approach 2: convert each batch RDD to a DataFrame and aggregate it with SQL.
    words.foreachRDD { rdd =>
      // getOrCreate() reuses an existing session instead of building one per batch.
      val spark = SparkSession.builder.config(rdd.sparkContext.getConf).getOrCreate()
      import spark.implicits._

      val wordsDataFrame = rdd.toDF("word")
      wordsDataFrame.createOrReplaceTempView("words")
      spark.sql("select word, count(*) as total from words group by word").show()
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
相关操作:
Transformations on DStreams
Window Operations
Join Operations
Output Operations on DStreams
DataFrame and SQL Operations