First, define a trait StreamingExample that extends App, sets the log level to WARN, and declares a few constants for the subclasses below to share.
trait StreamingExample extends App {
  Logger.getRootLogger.setLevel(Level.WARN)
  val hostname = "localhost"
  val port = 9999
  val checkpointDir = "spark_streaming/checkpoint"
  val outputDir = "spark_streaming/output"
  val inputDir = "spark_streaming/input"
}
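The snippets below omit their import statements. As a rough guide, a minimal common set (assumed here; per-example dependencies such as the Kafka integration, the receiver API, and Spark SQL are left out) would look roughly like this:

import scala.collection.mutable
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, State, StateSpec, StreamingContext, Time}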
1. Consuming data from HDFS
object HdfsWordCount extends StreamingExample {
  val sparkConf = new SparkConf().setAppName("HdfsWordCount")
  val ssc = new StreamingContext(sparkConf, Seconds(2))
  val lines = ssc.textFileStream(inputDir)
  val words = lines.flatMap(_.split(" "))
  val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
  wordCounts.print()
  ssc.start()
  ssc.awaitTermination()
}
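Note that textFileStream only picks up files that appear in inputDir after the streaming application has started, and each file should appear atomically, i.e. be written elsewhere first and then moved into the monitored directory. A minimal sketch for feeding the stream (file names and contents here are made up for illustration):

import java.nio.file.{Files, Paths, StandardCopyOption}

object FeedInputDir extends App {
  val inputDir = Paths.get("spark_streaming/input")
  Files.createDirectories(inputDir)
  // Write the file outside the monitored directory first ...
  val tmp = Files.write(Paths.get("spark_streaming/words.txt.tmp"),
    "hello spark hello streaming".getBytes("UTF-8"))
  // ... then move it in atomically so textFileStream never sees a half-written file.
  Files.move(tmp, inputDir.resolve("words.txt"), StandardCopyOption.ATOMIC_MOVE)
}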
2. Consuming data from a TCP socket
/**
 * Send test data with: $ nc -lk 9999
 * Purpose: consume data from a SocketInputDStream (given a host and port)
 */
object NetworkWordCount extends StreamingExample {
  // local[3]: the receiver permanently occupies one thread, so at least one more is needed to process the received batches
  val sparkConf = new SparkConf().setMaster("local[3]").setAppName("NetworkWordCount")
  val ssc = new StreamingContext(sparkConf, Seconds(5))
  val lines = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER)
  val words = lines.flatMap(_.split(" "))
  val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
  wordCounts.print()
  ssc.start()
  ssc.awaitTermination()
}
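A common variation on the same stream is to count words over a sliding window instead of a single batch. A minimal sketch, assuming a 30-second window and a 10-second slide are acceptable (both must be multiples of the batch interval):

object WindowedNetworkWordCount extends StreamingExample {
  val sparkConf = new SparkConf().setMaster("local[3]").setAppName("WindowedNetworkWordCount")
  val ssc = new StreamingContext(sparkConf, Seconds(5))
  val lines = ssc.socketTextStream(hostname, port, StorageLevel.MEMORY_AND_DISK_SER)
  val words = lines.flatMap(_.split(" "))
  // Count the words seen in the last 30 seconds, recomputed every 10 seconds.
  val windowedCounts = words.map(x => (x, 1))
    .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(30), Seconds(10))
  windowedCounts.print()
  ssc.start()
  ssc.awaitTermination()
}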
3. Consuming data produced in-process
/**
 * Purpose: consume data from a QueueInputDStream
 */
object QueueStream extends StreamingExample {
  // 1. Configure Spark
  val sparkConf = new SparkConf().setMaster("local[3]").setAppName("QueueStream")
  // 2. Create the StreamingContext
  val ssc = new StreamingContext(sparkConf, Seconds(2))
  val rddQueue = new mutable.Queue[RDD[Int]]()
  // 3. Define the input stream
  val inputStream = ssc.queueStream(rddQueue)
  // 4. Build the workflow (DAG)
  val mappedStream = inputStream.map(x => (x % 10, 1))
  val reducedStream = mappedStream.reduceByKey(_ + _)
  reducedStream.print()
  // 5. Start the application
  ssc.start()
  // Feed the rddQueue while the application runs
  for (i <- 1 to 30) {
    rddQueue.synchronized {
      rddQueue += ssc.sparkContext.makeRDD(1 to 1000, 10)
    }
    Thread.sleep(1000)
  }
  // 6. Stop the application
  ssc.stop()
}
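A parameterless ssc.stop() also stops the underlying SparkContext and does not wait for already-received data to be processed. When either behaviour is unwanted, the two flags can be passed explicitly, for example:

// Keep the SparkContext alive for further work and let in-flight batches finish first.
ssc.stop(stopSparkContext = false, stopGracefully = true)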
4. Consuming data from Kafka
object DirectKafkaWordCount extends StreamingExample {
  // Command-line arguments: <brokers> <groupId> <topics> (comma-separated)
  val Array(brokers, groupId, topics) = args
  // Create context with 2 second batch interval
  val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount")
  val ssc = new StreamingContext(sparkConf, Seconds(2))
  // Create direct kafka stream with brokers and topics
  val topicsSet = topics.split(",").toSet
  val kafkaParams = Map[String, Object](
    ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokers,
    ConsumerConfig.GROUP_ID_CONFIG -> groupId,
    ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
    ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer])
  val messages = KafkaUtils.createDirectStream[String, String](
    ssc,
    // Location strategies (how consumers are placed on executors): PreferConsistent, PreferBrokers, PreferFixed
    LocationStrategies.PreferConsistent,
    // Consumer strategies: Subscribe (fixed topic list), SubscribePattern (topics matched by regex, so newly created topics are picked up), Assign (fixed topic partitions)
    ConsumerStrategies.Subscribe[String, String](topicsSet, kafkaParams))
  // Get the lines, split them into words, count the words and print
  val lines = messages.map(_.value)
  val words = lines.flatMap(_.split(" "))
  val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
  wordCounts.print()
  // Start the computation
  ssc.start()
  ssc.awaitTermination()
}
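The direct stream leaves offset management to the application. If offsets should only be committed back to Kafka after a batch has been processed, the integration exposes each batch's offset ranges; a minimal sketch that could be added to the example above, before ssc.start():

import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}

messages.foreachRDD { rdd =>
  // Offset ranges are only available on the RDDs produced directly by the Kafka stream.
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // ... process the batch here ...
  // Commit after processing; the commit is asynchronous and runs on the driver.
  messages.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}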
5. A custom Receiver
object CustomReceiver extends StreamingExample {
  val sparkConf = new SparkConf().setAppName("CustomReceiver")
  val ssc = new StreamingContext(sparkConf, Seconds(1))
  val lines = ssc.receiverStream(new CustomReceiver(hostname, port))
  val words = lines.flatMap(_.split(" "))
  val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
  wordCounts.print()
  ssc.start()
  ssc.awaitTermination()
}
class CustomReceiver(host: String, port: Int)
  extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) {

  def onStart() {
    // Start a dedicated thread for receiving; onStart() itself must not block.
    new Thread("Socket Receiver") {
      override def run() {
        receive()
      }
    }.start()
  }

  def onStop() {
    // Nothing to do here: the receiving thread checks isStopped() and exits on its own.
  }

  /** Create a socket connection and receive data until receiver is stopped */
  private def receive() {
    var socket: Socket = null
    var userInput: String = null
    try {
      println(s"Connecting to $host : $port")
      socket = new Socket(host, port)
      println(s"Connected to $host : $port")
      val reader = new BufferedReader(
        new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8))
      userInput = reader.readLine()
      while (!isStopped && userInput != null) {
        store(userInput)
        userInput = reader.readLine()
      }
      reader.close()
      socket.close()
      println("Stopped receiving")
      restart("Trying to connect again")
    } catch {
      case e: java.net.ConnectException =>
        restart(s"Error connecting to $host : $port", e)
      case t: Throwable =>
        restart("Error receiving data", t)
    }
  }
}
6. Computing the word count with Spark SQL
object SqlNetworkWordCount extends StreamingExample {
  val sparkConf = new SparkConf().setMaster("local[3]").setAppName("SqlNetworkWordCount")
  val ssc = new StreamingContext(sparkConf, Seconds(2))
  val lines = ssc.socketTextStream(hostname, port, StorageLevel.MEMORY_AND_DISK_SER)
  val words = lines.flatMap(_.split(" "))
  // Convert RDDs of the words DStream to DataFrame and run SQL query
  words.foreachRDD { (rdd: RDD[String], time: Time) =>
    // Get (or lazily create) the singleton SparkSession
    val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
    import spark.implicits._
    // RDD[String] -> RDD[case class] -> DataFrame
    val wordsDataFrame = rdd.map(w => Record(w)).toDF()
    // Register a temporary view named "words"
    wordsDataFrame.createOrReplaceTempView("words")
    // Run the SQL query
    val wordCountsDataFrame = spark.sql("select word, count(*) as total from words group by word")
    // Print the result for this batch
    println(s"========= $time =========")
    wordCountsDataFrame.show()
  }
  ssc.start()
  ssc.awaitTermination()
}
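Instead of only showing the result, each batch's counts could also be persisted under the outputDir defined in the trait. A minimal sketch of the extra lines inside foreachRDD (the JSON format and per-batch directory layout are arbitrary choices here):

// Write this batch's counts as JSON, one sub-directory per batch time.
wordCountsDataFrame.write
  .mode("append")
  .json(s"$outputDir/wordcounts/batch-${time.milliseconds}")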
case class Record(word: String)

object SparkSessionSingleton {

  @transient private var instance: SparkSession = _

  def getInstance(sparkConf: SparkConf): SparkSession = {
    if (instance == null) {
      instance = SparkSession
        .builder
        .config(sparkConf)
        .getOrCreate()
    }
    instance
  }
}
7. Accumulating word counts across batches with mapWithState
/**
 * Purpose: consume data from a SocketInputDStream and keep a cumulative word count
 */
object StatefulNetworkWordCount extends StreamingExample {
  val sparkConf = new SparkConf().setMaster("local[4]").setAppName("StatefulNetworkWordCount")
  val ssc = new StreamingContext(sparkConf, Seconds(1))
  ssc.checkpoint("checkpoint")
  // Initial state RDD for mapWithState operation
  val initialRDD = ssc.sparkContext.parallelize(List(("hello", 1), ("world", 1)))
  val lines = ssc.socketTextStream(hostname, port)
  val words = lines.flatMap(_.split(" "))
  val wordDstream = words.map(x => (x, 1))
  // Compute the running count per word; the State object carries the value across batches
  val mappingFunc = (word: String, one: Option[Int], state: State[Int]) => {
    val sum = one.getOrElse(0) + state.getOption.getOrElse(0)
    val output = (word, sum)
    state.update(sum)
    output
  }
  val stateDstream = wordDstream.mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD))
  stateDstream.print()
  ssc.start()
  ssc.awaitTermination()
}
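For comparison, the older updateStateByKey API yields the same running counts, but it recomputes and emits the state of every key in every batch, whereas mapWithState only touches keys that received new data. A minimal sketch that could replace the mapWithState part above:

// values holds this batch's counts for a word, state the running total so far.
val updateFunc = (values: Seq[Int], state: Option[Int]) => {
  Some(values.sum + state.getOrElse(0))
}
val totalCounts = wordDstream.updateStateByKey[Int](updateFunc)
totalCounts.print()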
8. Rebuilding the application from a checkpoint
/**
 * Purpose: demonstrate rebuilding an application from a checkpoint, and the use of broadcast variables and accumulators
 */
object RecoverableNetworkWordCount extends StreamingExample {

  // If checkpoint data exists, rebuild the application from it; otherwise create a fresh context
  val ssc = StreamingContext.getOrCreate(checkpointDir,
    () => createContext(hostname, port, outputDir, checkpointDir))
  ssc.start()
  ssc.awaitTermination()

  def createContext(ip: String, port: Int, outputPath: String, checkpointDirectory: String): StreamingContext = {
    println("Creating new context .....")
    // Remove any stale output file from a previous run
    val outputFile = new File(outputPath)
    if (outputFile.exists()) outputFile.delete()
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("RecoverableNetworkWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(3))
    ssc.checkpoint(checkpointDirectory)
    val lines = ssc.socketTextStream(ip, port)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map((_, 1)).reduceByKey(_ + _)
    wordCounts.foreachRDD { (rdd: RDD[(String, Int)], time: Time) =>
      // Get (or lazily re-create) the broadcast blacklist
      val blacklist = WordBlacklist.getInstance(rdd.sparkContext)
      // Accumulator counting how many blacklisted words were dropped
      val droppedWordsCounter = DroppedWordsCounter.getInstance(rdd.sparkContext)
      // Filter out blacklisted words
      val counts = rdd.filter { case (word, count) =>
        if (blacklist.value.contains(word)) {
          droppedWordsCounter.add(count)
          false
        } else {
          true
        }
      }.collect().mkString("[", ", ", "]")
      // Emit the filtered counts
      val output = s"Counts at time $time $counts"
      println(output)
      println(s"Dropped ${droppedWordsCounter.value} word(s) totally")
      println(s"Appending to ${outputFile.getAbsolutePath}")
      Files.append(output + "\n", outputFile, Charset.defaultCharset())
    }
    ssc
  }
}
object WordBlacklist {

  @volatile private var instance: Broadcast[Seq[String]] = null

  def getInstance(sc: SparkContext): Broadcast[Seq[String]] = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          val wordBlacklist = Seq("a", "b", "c")
          instance = sc.broadcast(wordBlacklist)
        }
      }
    }
    instance
  }
}

object DroppedWordsCounter {

  @volatile private var instance: LongAccumulator = _

  def getInstance(sc: SparkContext): LongAccumulator = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          instance = sc.longAccumulator("WordsInBlacklistCounter")
        }
      }
    }
    instance
  }
}