package com.shujia.streaming
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
object Demo05StructuredStreaming {
  /**
   * Structured Streaming word-count demo.
   *
   * Reads comma-separated words from a socket source (test with `nc -lk 8888`
   * on host "master"), then runs TWO concurrent streaming queries over the
   * same input stream:
   *   1. DataFrame-API word count, written to the console sink.
   *   2. SQL word count over a temp view, written to MySQL via foreachBatch.
   *
   * BUG FIXES vs. the original:
   *   - The first query called `.awaitTermination()` immediately after
   *     `.start()`, blocking the main thread forever, so the SQL query below
   *     was dead code. Now both queries are started first and the program
   *     blocks once at the end with `awaitAnyTermination()`.
   *   - The second query never called `.start()`; building a
   *     DataStreamWriter with `foreachBatch` does nothing until started.
   */
  def main(args: Array[String]): Unit = {
    // Create the SparkSession (local mode, 2 shuffle partitions to keep the demo light)
    val spark: SparkSession = SparkSession
      .builder()
      .appName("Demo05StructuredStreaming")
      .master("local[2]")
      .config("spark.sql.shuffle.partitions", "2")
      .getOrCreate()

    // Socket source; feed it with: nc -lk 8888
    val linesDF: DataFrame = spark
      .readStream
      .format("socket") // socket as the source
      .option("host", "master")
      .option("port", 8888)
      .load()

    import org.apache.spark.sql.functions._
    import spark.implicits._

    // Query 1: word count via the DataFrame API, sink to console.
    linesDF
      .select(explode(split($"value", ",")) as "words")
      .groupBy($"words")
      .agg(count("*") as "cnt")
      .writeStream
      /**
       * outputMode has three modes:
       * Append   - cannot contain aggregations; only for simple queries
       * Complete - outputs all result rows; only valid with aggregations
       * Update   - outputs new/changed rows; without aggregation it behaves like Append
       */
      .outputMode(OutputMode.Complete())
      .format("console") // console sink
      .start() // do NOT awaitTermination() here — it would block the second query

    // Query 2: same word count expressed in SQL (unified batch/stream API),
    // written to MySQL once per micro-batch via foreachBatch.
    linesDF.createOrReplaceTempView("word_count")
    spark.sql(
      """
        |select t1.word
        | ,count(*) as cnt
        |from
        |(
        | select explode(split(value, ",")) as word
        | from word_count
        | ) t1 group by t1.word
        |""".stripMargin)
      .writeStream
      .outputMode(OutputMode.Complete())
      // Explicit parameter types disambiguate the Scala vs. Java foreachBatch
      // overloads in Scala 2.12. Overwrite is required because Complete mode
      // re-emits the full result table each batch.
      .foreachBatch((df: DataFrame, batchId: Long) => {
        df.write.format("jdbc")
          .option("url", "jdbc:mysql://master:3306/stu016?useSSL=false")
          .option("dbtable", "student")
          .option("user", "shujia016")
          .option("password", "123456")
          .mode(SaveMode.Overwrite)
          .save()
      }) // foreachBatch lets us reuse the batch DataFrame API per micro-batch
      .start() // FIX: the query was never started in the original

    // Block the driver until any of the active streaming queries terminates,
    // keeping both queries running concurrently.
    spark.streams.awaitAnyTermination()
  }
}
// StructuredStreaming
// (blog-scrape metadata, not code — original text: "最新推荐文章于 2024-02-06 14:33:31 发布")