Overview: this article walks through several practical Spark Streaming examples: writing streaming word counts to a MySQL database, blacklist filtering with transform, and combining Spark Streaming with Spark SQL.
1. Writing word-count results to a database
(1) Create the table (in the sparksql database that the JDBC URL below points at):
CREATE TABLE wordcount(
  word VARCHAR(50) DEFAULT NULL,
  wordcount INT(10) DEFAULT NULL
);
(2) Create the JDBC connection:
import java.sql.DriverManager

def createConnection() = {
  // Load the MySQL JDBC driver and open a connection to the sparksql database
  Class.forName("com.mysql.jdbc.Driver")
  DriverManager.getConnection("jdbc:mysql://localhost:3306/sparksql?useSSL=true", "root", "root")
}
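This assumes the MySQL JDBC driver is on the classpath; with Maven that is, for example (the version here is illustrative):

<dependency>
  <groupId>mysql</groupId>
  <artifactId>mysql-connector-java</artifactId>
  <version>5.1.47</version>
</dependency>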
(3) Implementation:
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

def main(args: Array[String]): Unit = {
  val sparkConf = new SparkConf().setAppName("ForeachRDDApp").setMaster("local[2]")
  val ssc = new StreamingContext(sparkConf, Seconds(5))

  val lines = ssc.socketTextStream("localhost", 6789)
  val result = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)

  // Write the word counts to the database.
  // The version below fails with "Task not serializable": the connection is
  // created on the driver but used inside rdd.foreach, which runs on the executors.
  // result.foreachRDD { rdd =>
  //   val connection = createConnection() // executed at the driver
  //   rdd.foreach { record =>
  //     val sql = "insert into wordcount(word,wordcount) values('" + record._1 + "','" + record._2 + "')"
  //     connection.createStatement().execute(sql) // executed at the workers
  //   }
  // }

  result.print()

  // Correct pattern: create the connection per partition, on the executor side
  result.foreachRDD { rdd =>
    rdd.foreachPartition { partitionOfRecords =>
      val connection = createConnection()
      partitionOfRecords.foreach(record => {
        val sql = "insert into wordcount(word,wordcount) values('" + record._1 + "','" + record._2 + "')"
        connection.createStatement().execute(sql)
      })
      connection.close()
    }
  }

  ssc.start()
  ssc.awaitTermination()
}
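To test, start a socket source with nc -lk 6789 and type space-separated words; each 5-second batch is counted and written to MySQL. Note that the string-concatenated INSERT above is open to SQL injection and creates a new Statement per record. A minimal hardening sketch of the same per-partition loop, using a PreparedStatement against the same wordcount table:

result.foreachRDD { rdd =>
  rdd.foreachPartition { partitionOfRecords =>
    val connection = createConnection()
    // One PreparedStatement per partition; values are bound, not concatenated
    val stmt = connection.prepareStatement("insert into wordcount(word, wordcount) values(?, ?)")
    partitionOfRecords.foreach { case (word, count) =>
      stmt.setString(1, word)
      stmt.setInt(2, count)
      stmt.executeUpdate()
    }
    stmt.close()
    connection.close()
  }
}

The Spark Streaming programming guide goes a step further and recommends a static connection pool so connections are reused across batches.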
2. Blacklist filtering
(1) Functional analysis
# Access log ==> DStream
20190516,zhangsan
20190516,lisi
20190516,wangwu
20190516,zhaoliu
Keyed by user ==> (zhangsan, "20190516,zhangsan"), (lisi, "20190516,lisi"), (wangwu, "20190516,wangwu"), (zhaoliu, "20190516,zhaoliu")
# Blacklist ==> RDD
lisi
zhaoliu
Keyed as flags ==> (lisi, true), (zhaoliu, true)
# leftOuterJoin of access log and blacklist ==>
(zhangsan, ("20190516,zhangsan", None)), (lisi, ("20190516,lisi", Some(true))), (wangwu, ("20190516,wangwu", None)), (zhaoliu, ("20190516,zhaoliu", Some(true)))
# Filter out the Some(true) entries and keep the original line ==> output
20190516,zhangsan
20190516,wangwu
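The Option flags above are exactly what RDD.leftOuterJoin produces. Below is a minimal standalone sketch of the join-then-filter step on plain RDDs (the names LeftJoinDemo, clicks, and survivors are illustrative):

import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("LeftJoinDemo"))
val clicks = sc.parallelize(Seq(
  ("zhangsan", "20190516,zhangsan"), ("lisi", "20190516,lisi"),
  ("wangwu", "20190516,wangwu"), ("zhaoliu", "20190516,zhaoliu")))
val blacks = sc.parallelize(Seq(("lisi", true), ("zhaoliu", true)))
// leftOuterJoin keeps every click; the flag is Some(true) only for blacklisted users
val joined = clicks.leftOuterJoin(blacks) // RDD[(String, (String, Option[Boolean]))]
val survivors = joined
  .filter { case (_, (_, flag)) => !flag.getOrElse(false) } // drop Some(true) entries
  .map { case (_, (line, _)) => line }                      // keep the raw log line
survivors.collect().foreach(println) // prints the zhangsan and wangwu lines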
(2) Implementation:
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object TransformApp {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("TransformApp")
    // Create the StreamingContext with a 5-second batch interval
    val scc = new StreamingContext(sparkConf, Seconds(5))
    // Build the blacklist as an RDD of (user, true) flags
    val blacks = List("lisi", "zhaoliu")
    val blacksRDD = scc.sparkContext.parallelize(blacks).map(x => (x, true))
    val lines = scc.socketTextStream("localhost", 6789)
    // transform exposes each batch's RDD so it can be joined against blacksRDD
    val clicklog = lines.map(x => (x.split(",")(1), x)).transform(rdd => {
      rdd.leftOuterJoin(blacksRDD)
        .filter(x => !x._2._2.getOrElse(false))
        .map(x => x._2._1)
    })
    clicklog.print()
    scc.start()
    scc.awaitTermination()
  }
}
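To try it, run nc -lk 6789 and paste the four access-log lines from the analysis above; each 5-second batch should print only the zhangsan and wangwu records.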
3. Word count with Spark Streaming and Spark SQL
(1) Add the dependency:
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-sql_2.11</artifactId>
  <version>2.4.2</version>
</dependency>
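This is in addition to the spark-streaming_2.11 dependency (same version) that the earlier examples already assume is in the project.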
(2) Implementation:
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

object SqlNetworkWordCount {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SqlNetworkWordCount").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(2))
    val lines = ssc.socketTextStream("localhost", 6789)
    val words = lines.flatMap(_.split(" "))

    // Convert each RDD of the words DStream to a DataFrame and run a SQL query
    words.foreachRDD { (rdd: RDD[String], time: Time) =>
      // Get the singleton instance of SparkSession
      val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
      import spark.implicits._

      // Convert RDD[String] to RDD[case class] to DataFrame
      val wordsDataFrame = rdd.map(w => Record(w)).toDF()

      // Create a temporary view using the DataFrame
      wordsDataFrame.createOrReplaceTempView("words")

      // Do the word count on the view using SQL and print it
      val wordCountsDataFrame =
        spark.sql("select word, count(*) as total from words group by word")
      println(s"========= $time =========")
      wordCountsDataFrame.show()
    }

    ssc.start()
    ssc.awaitTermination()
  }

  case class Record(word: String)

  /** Lazily instantiated singleton instance of SparkSession */
  object SparkSessionSingleton {
    @transient private var instance: SparkSession = _

    def getInstance(sparkConf: SparkConf): SparkSession = {
      if (instance == null) {
        instance = SparkSession
          .builder
          .config(sparkConf)
          .getOrCreate()
      }
      instance
    }
  }
}
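The null check makes the SparkSession lazily instantiated: it is built once, on the first batch, and then reused by every subsequent foreachRDD call instead of being reconstructed per batch.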
(3) Results: every 2-second batch prints a "========= <batch time> =========" banner followed by the word/total table produced by the SQL query.