学习 checkpoint 使用时,重启后总是获取不到重启之前累加的值,后来才发现是用法不对。
最开始是直接用 new 的方式创建 StreamingContext 对象,代码如下:
/**
 * Socket word-count that accumulates running totals via updateStateByKey.
 *
 * NOTE(review): this version builds the StreamingContext directly with `new`,
 * so even though state is checkpointed to "./socket", a restarted driver never
 * reads the checkpoint back and counts restart from zero. Recovery requires
 * StreamingContext.getOrCreate (see the corrected version below).
 */
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setMaster("local[4]").setAppName("SparkStreamingSocketTotal")
// Create the SparkContext
val sc = new SparkContext(conf)
sc.setLogLevel("warn")
// Create the StreamingContext with a 5-second batch interval
val ssc = new StreamingContext(sc,Seconds(5))
// Checkpoint directory keeps each word's running total between batches
ssc.checkpoint("./socket")
// Receive lines from the socket source
// (fixed: original referenced an undefined symbol `ip`; use the explicit host,
// consistent with functionToCreateContext below)
val scokettext: ReceiverInputDStream[String] = ssc.socketTextStream("xx.xx.xx.xx" ,9999)
// Split every line into words
val words: DStream[String] = scokettext.flatMap(_.split(" ")) 
// Map each word to a count of 1
val wordone: DStream[(String, Int)] = words.map(x=>(x,1))
// Fold the per-batch counts into the checkpointed running state
val result: DStream[(String, Int)] = wordone.updateStateByKey(updateFunc)
result.print()
ssc.start()
ssc.awaitTermination()
}
程序执行时数据也保存到了 socket 目录,显示结果累加正确,但是程序重启后计数总是从 0 开始,重启之前的数据获取不到。
正确方法应该是用 StreamingContext.getOrCreate 来获取 StreamingContext 对象:getOrCreate 先检查给定的 checkpoint 路径是否存在,存在则从 checkpoint 数据恢复上下文,否则调用创建函数新建一个,具体代码如下:
/**
 * State-update function for updateStateByKey: adds the counts seen for a key
 * in the current batch onto the running total carried in checkpointed state.
 *
 * @param currentValues counts for this key in the current batch
 * @param historyValues previously accumulated total, None for a new key
 * @return the new running total, always Some so the key's state is retained
 */
def updateFunc(currentValues: Seq[Int], historyValues: Option[Int]): Option[Int] = {
  // Start from the prior total (0 for a first-seen key) and fold in this batch.
  val previousTotal = historyValues.getOrElse(0)
  Some(currentValues.foldLeft(previousTotal)(_ + _))
}
// Single source of truth for the checkpoint path; `val` since it is never reassigned.
val checkpointDirectory = "./socket"
/**
 * Factory for a fresh StreamingContext. Invoked by StreamingContext.getOrCreate
 * ONLY when no checkpoint exists at `checkpointDirectory`; on a restart the
 * context (and the accumulated word counts) is rebuilt from the checkpoint
 * instead, so this function is skipped.
 */
def functionToCreateContext(): StreamingContext = {
val conf: SparkConf = new SparkConf().setMaster("local[4]").setAppName("SparkStreamingSocketTotal")
// Create the SparkContext
val sc = new SparkContext(conf)
sc.setLogLevel("warn")
// Create the StreamingContext with a 5-second batch interval
val ssc = new StreamingContext(sc,Seconds(5))
// Use the shared checkpointDirectory constant rather than duplicating the
// literal "./socket": getOrCreate and checkpoint must point at the SAME path,
// or recovery will silently read from a different directory.
ssc.checkpoint(checkpointDirectory)
// Receive lines from the socket source
val scokettext: ReceiverInputDStream[String] = ssc.socketTextStream("xx.xx.xx.xx" ,9999)
// Split every line into words
val words: DStream[String] = scokettext.flatMap(_.split(" ")) 
// Map each word to a count of 1
val wordone: DStream[(String, Int)] = words.map(x=>(x,1))
// Fold the per-batch counts into the checkpointed running state
val result: DStream[(String, Int)] = wordone.updateStateByKey(updateFunc)
result.print()
ssc
}
// Entry point: recover the StreamingContext from the checkpoint if one exists,
// otherwise build it via functionToCreateContext. This is what makes the
// accumulated counts survive a restart.
def main(args: Array[String]): Unit = {
//val conf: SparkConf = new SparkConf().setMaster("local[4]").setAppName("SparkStreamingSocketTotal")
val ssc = StreamingContext.getOrCreate(checkpointDirectory, functionToCreateContext _)
// Start the streaming computation and block until it is stopped or fails.
ssc.start()
ssc.awaitTermination()
}
这样无论是本地还是集群模式,重启后都会获取到之前的累加数据。
下面是 StreamingContext 中 getOrCreate 方法的源码:
// Spark library source (quoted for explanation — do not edit):
// getOrCreate first tries to read an existing checkpoint from checkpointPath;
// if one is found, the StreamingContext is reconstructed from it, otherwise
// creatingFunc is invoked to build a brand-new context.
def getOrCreate(
checkpointPath: String,
creatingFunc: () => StreamingContext,
hadoopConf: Configuration = SparkHadoopUtil.get.conf,
createOnError: Boolean = false
): StreamingContext = {
// Read the checkpoint data (None if the path has no valid checkpoint)
val checkpointOption = CheckpointReader.read(
checkpointPath, new SparkConf(), hadoopConf, createOnError)
// Recover from the checkpoint when present; fall back to creatingFunc otherwise
checkpointOption.map(new StreamingContext(null, _, null)).getOrElse(creatingFunc())
}