Overview
Structured Streaming supports most of the common operations on DataFrames/Datasets: the operations we learned earlier for RDDs, DataFrames, and Datasets can also be used on a streaming DataFrame/Dataset.
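For example, the familiar untyped operations such as filter, select, and groupBy apply to a streaming DataFrame exactly as they do to a static one. A minimal sketch, assuming a streaming DataFrame df with a single string column value (like the socket source used in the examples below); the threshold and column names are illustrative only:

import org.apache.spark.sql.functions.{col, length, upper}

// Hedged sketch: static-style DataFrame operations applied to a streaming DataFrame `df`.
val longLines = df.filter(length(col("value")) > 10)                    // keep only lines longer than 10 characters
val shouting  = longLines.select(upper(col("value")).as("upper_value")) // project/transform a column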
Approaches
DSL
import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object wordCount_DSL {
  def main(args: Array[String]): Unit = {
    // TODO 0. Create the environment
    // Structured Streaming is built on Spark SQL and its programming abstraction is DataFrame/Dataset,
    // so creating a SparkSession is all that is needed here.
    val spark: SparkSession = SparkSession
      .builder()
      .appName("sparksql")
      .master("local[*]")
      .config("spark.sql.shuffle.partitions", "4") // keep the shuffle partition count small for this test; tune it to the cluster size in production (the default is 200)
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    import spark.implicits._

    // TODO 1. Load the data
    val df: DataFrame = spark.readStream
      .format("socket")
      .option("host", "node1")
      .option("port", 9999)
      .load()
    df.printSchema()
    /*
    root
     |-- value: string (nullable = true)
    */
    // df.show() // not allowed on a streaming DataFrame: "Queries with streaming sources must be executed with writeStream.start();"

    // TODO 2. Process the data -- DSL
    val ds: Dataset[String] = df.as[String]
    val result: Dataset[Row] = ds.flatMap(_.split(" "))
      .groupBy($"value")
      .count()
      .orderBy($"count".desc)

    // TODO 3. Output the result
    result.writeStream
      .format("console")
      .outputMode("complete") // complete: the entire updated result table is written to the sink on every trigger
      // TODO 4. Start the query and wait for it to terminate
      .start()
      .awaitTermination()

    // TODO 5. Release resources
    spark.stop()
  }
}
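Besides complete, Structured Streaming also offers the append and update output modes. A hedged variation of steps 2-3 above, reusing ds from the DSL example: update writes only the rows whose count changed in the current trigger. The orderBy has to be dropped because sorting a streaming aggregation is only supported in complete mode, and append cannot be used with this unbounded aggregation unless a watermark is defined.

// Hedged sketch: the same word count without the sort, written in "update" mode.
val updating: DataFrame = ds.flatMap(_.split(" "))
  .groupBy($"value")
  .count()

updating.writeStream
  .format("console")
  .outputMode("update") // update: only rows changed since the last trigger are written to the sink
  .start()
  .awaitTermination()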
SQL
import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, SparkSession}

object wordCount_SQL {
  def main(args: Array[String]): Unit = {
    // TODO 0. Create the environment
    // Structured Streaming is built on Spark SQL and its programming abstraction is DataFrame/Dataset,
    // so creating a SparkSession is all that is needed here.
    val spark: SparkSession = SparkSession
      .builder()
      .appName("sparksql")
      .master("local[*]")
      .config("spark.sql.shuffle.partitions", "4") // keep the shuffle partition count small for this test; tune it to the cluster size in production (the default is 200)
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    import spark.implicits._

    // TODO 1. Load the data
    val df: DataFrame = spark.readStream
      .format("socket")
      .option("host", "node1")
      .option("port", 9999)
      .load()
    df.printSchema()
    /*
    root
     |-- value: string (nullable = true)
    */
    // df.show() // not allowed on a streaming DataFrame: "Queries with streaming sources must be executed with writeStream.start();"

    // TODO 2. Process the data -- SQL
    // 2.1 Register a temporary view so the streaming DataFrame can be queried with SQL
    df.createOrReplaceTempView("tmp_df")
    val result: DataFrame = spark.sql(
      """
        |select value, count(*) as count
        |from tmp_df
        |group by value
        |""".stripMargin)

    // TODO 3. Output the result
    result.writeStream
      .format("console")
      .outputMode("complete") // complete: the entire updated result table is written to the sink on every trigger
      // TODO 4. Start the query and wait for it to terminate
      .start()
      .awaitTermination()

    // TODO 5. Release resources
    spark.stop()
  }
}
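Unlike the DSL example, the SQL version above does not sort the counts. A hedged sketch of the same query with an order by clause (sorting a streaming aggregation still requires outputMode("complete")):

// Hedged sketch: SQL word count sorted by descending count, mirroring the DSL example's orderBy.
val sorted: DataFrame = spark.sql(
  """
    |select value, count(*) as count
    |from tmp_df
    |group by value
    |order by count desc
    |""".stripMargin)

To try either program, first start a socket server on node1, for example with nc -lk 9999, then type lines of space-separated words; each trigger's result table is printed to the console.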