示意图
- sparkstreaming实时处理DStream,SparkSQL批处理DataFrame
- sparkstreaming处理的DStream内部封装有RDD,通过
foreachRDD
将DStream转换成RDD - SparkSQL处理的DataFrame内部同样封装有RDD,DataFrame通过注册成表,被SparkSQL处理
- 所以本质上SparkSQL处理的还是RDD
代码
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
object scToSQL {
  /** Reads words from a socket as a DStream, then word-counts each micro-batch with Spark SQL. */
  def main(args: Array[String]): Unit = {
    // Streaming context with 5-second micro-batches, running locally on two cores.
    val conf: SparkConf = new SparkConf().setAppName("scToSQL").setMaster("local[2]")
    val ssc = new StreamingContext(conf,Seconds(5))

    // Receive raw text lines from the socket and split them into individual words.
    val lines: ReceiverInputDStream[String] = ssc.socketTextStream("node01",9999)
    val words: DStream[String] = lines.flatMap(_.split(" "))

    // Debug output: print every word of each batch RDD (runs on the executors).
    words.foreachRDD(rdd => rdd.foreach(println))

    // For each batch, turn the RDD into a DataFrame and run a SQL word count over it.
    words.foreachRDD(rdd => {
      // Obtain (or reuse) a SparkSession so the RDD can be converted into a DataFrame.
      val session: SparkSession = SparkSession.builder().config(rdd.sparkContext.getConf).getOrCreate()
      import session.implicits._
      // RDD -> DataFrame with a single "word" column.
      val wordFrame: DataFrame = rdd.toDF("word")
      // Register the DataFrame as a temporary view so it is queryable via SQL.
      wordFrame.createOrReplaceTempView("words")
      val counted: DataFrame = session.sql("select word,count(*) as count from words group by word")
      counted.show()
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
输入
结果