Spark Example: A Recoverable Spark Streaming Application
Key points
- The application resumes from where it last stopped
- Spark Streaming stream processing
- Consuming from Kafka
- checkpoint
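How recovery works: StreamingContext.getOrCreate(checkpointDirectory, creatingFunc) first looks for checkpoint data in the given directory. If any is found, the StreamingContext is rebuilt from it (including the DStream graph and, for the direct Kafka API, the consumed offsets); only if the directory is empty is the creating function invoked. For this to work correctly, all of the DStream setup must happen inside the creating function, and the checkpoint directory should live on fault-tolerant storage such as HDFS.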
Code + explanation
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * A recoverable Spark Streaming example.
  *
  * @author ALion
  * @version 2018/10/13 10:12
  */
object RecoverableStreamDemo {

  def main(args: Array[String]): Unit = {
    val checkpointDirectory = "/spark/checkpoint"

    val ssc = StreamingContext.getOrCreate(
      checkpointDirectory,                     // the application can recover from here
      () => createContext(checkpointDirectory) // called to build a fresh context if there is nothing to recover
    )

    ssc.start()
    ssc.awaitTermination()
  }
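  // Caveat: the checkpoint stores the serialized DStream graph, so after the
  // application code changes, old checkpoint data generally cannot be
  // restored. Delete the checkpoint directory when deploying a modified
  // version of this job.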
  def createContext(checkpointDirectory: String): StreamingContext = {
    // 1. Prepare the parameters
    val Array(batchTime, windowTime, topics, brokers) = Array(
      "3",            // batch interval, in seconds
      "6",            // window duration, in seconds (declared but not used in this example)
      "TOPIC_PERSON", // Kafka topic
      "13.68.10.1:21005,13.68.10.2:21005,13.68.10.3:21005" // Kafka brokers
    )
    val batchDuration = Seconds(batchTime.toInt)
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
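    // Note: "metadata.broker.list" is the broker-list key of the old Kafka 0.8
    // direct API (spark-streaming-kafka); the newer kafka-0-10 integration
    // expects "bootstrap.servers" instead.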
    // 2. Build the context
    val conf = new SparkConf().setAppName("RecoverableApp")
    val ssc = new StreamingContext(conf, batchDuration)
    ssc.checkpoint(checkpointDirectory) // where this run's checkpoint data is saved; it must be set again here, inside the creating function

    // 3. Consume data from Kafka
    val lines = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
        ssc, kafkaParams, topicsSet)
      .map(_._2)
    // Extract the fields we need, keep only records where name == "XiaoMing", and print them
    lines.map(x => {
        val fields = x.split(",")
        val time = fields(0).toLong
        val name = fields(1)
        val age = fields(4)
        val address = fields(5)
        (time, name, age, address)
      })
      .filter(_._2 == "XiaoMing")
      .print()

    ssc
  }
}
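The job assumes comma-separated records, but only the indices it reads are known, so the exact layout below is an assumption: field 0 is taken as a numeric timestamp, field 1 as the name, field 4 as the age, and field 5 as the address. This small, self-contained sketch runs the same extraction outside of Spark:

object ParseCheck {
  def main(args: Array[String]): Unit = {
    // Hypothetical record; fields 2 and 3 are placeholders, since the job
    // never reads them
    val line = "1539396720000,XiaoMing,unused2,unused3,25,Beijing"
    val fields = line.split(",")
    val record = (fields(0).toLong, fields(1), fields(4), fields(5))
    println(record) // (1539396720000,XiaoMing,25,Beijing)
  }
}

To see recovery in action, run the job once against an empty /spark/checkpoint, kill it, and start it again: the second run rebuilds the StreamingContext from the checkpoint (including the Kafka offsets of the direct stream) instead of calling createContext, so processing resumes from where it stopped.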