// This example is adapted from the official Spark documentation and is for reference only.
package com.fyy.spark.streaming
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}
/**
 * Integrates Spark Streaming with Spark SQL to run a word-count query
 * on each micro-batch read from a socket source.
 *
 * @note Project: SparkStreamingProject
 * @author fanyanyan
 */
object SqlAndStreaming {

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SqlAndStreaming").setMaster("local[*]")
    // Micro-batch interval of 5 seconds.
    val ssc = new StreamingContext(sparkConf, Seconds(5))

    // Read text lines from the socket source and split them into words.
    val lines = ssc.socketTextStream("01.server.bd", 6666)
    val words = lines.flatMap(_.split(" "))

    // For each batch, convert the words RDD to a DataFrame and run a SQL query on it.
    words.foreachRDD { (rdd: RDD[String], time: Time) =>
      // Reuse a single lazily-created SparkSession instead of building one per batch.
      val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
      import spark.implicits._

      // Convert RDD[String] -> RDD[Record] so toDF() can derive the schema,
      // then form the DataFrame.
      val wordsDataFrame = rdd.map(w => Record(w)).toDF()

      // Register a temporary view so the DataFrame can be queried with SQL.
      wordsDataFrame.createOrReplaceTempView("words")

      // Count occurrences of each word in this batch via SQL.
      val wordCountsDataFrame =
        spark.sql("select word, count(*) as num from words group by word")

      // Include the batch time in the separator so successive batch outputs
      // can be told apart (the original bound `time` but never used it).
      println(s"========= $time =========")
      wordCountsDataFrame.show()
    }

    ssc.start()
    ssc.awaitTermination()
  }

  /** Case class used to give the RDD a schema when converting it to a DataFrame. */
  final case class Record(word: String)

  /**
   * Lazily-instantiated singleton holder for the SparkSession, so every
   * batch in foreachRDD shares one session. Runs on the driver only, so no
   * extra synchronization is added beyond the official example's pattern.
   */
  object SparkSessionSingleton {

    // @transient: never serialize the session if this object is captured in a closure.
    @transient private var instance: SparkSession = _

    /** Returns the shared SparkSession, creating it on first use with the given conf. */
    def getInstance(sparkConf: SparkConf): SparkSession = {
      if (instance == null) {
        instance = SparkSession
          .builder
          .config(sparkConf)
          .getOrCreate()
      }
      instance
    }
  }
}