1、因业务需要,要实时按天统计数据(PV、UV),并在第二天零点清零重新计算。实现主要分两步:
2、第一步:自定义 Source 产生数据(也可以直接读取 Kafka 中的数据)。
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver
import scala.reflect.ClassTag
import scala.util.Random
/**
* @Author: wpp
* @Date: 2020/5/4 23:36
*
*/
//自定义数据源 wpp
/**
 * Custom Spark Streaming source: every 50 ms it picks a random element from a
 * fixed array and pushes the record "<value>,1" to Spark.
 */
class CustomSourceWindowByFitValue() extends Receiver[String](StorageLevel.MEMORY_ONLY) {

  // Candidate values the receiver samples from.
  val myArray = Array(1, 2, 3, 5, 7, 8, 9, 10, 11, 12)

  /**
   * Invoked once when the receiver starts. Spawns a background thread that
   * reads (here: generates) data and hands it to Spark via store().
   */
  override def onStart(): Unit = {
    new Thread("Socket Receiver") {
      setDaemon(true) // do not keep the JVM alive on its own
      override def run(): Unit = {
        // FIX: loop on !isStopped() instead of `while (true)`. The original
        // endless loop kept calling store() after the driver stopped the
        // StreamingContext, which throws
        // "Cannot add data as BlockGenerator has not been started or has been stopped"
        // on every daily restart (visible in the pasted log).
        while (!isStopped()) {
          val sample = takeSample(myArray, 1, System.currentTimeMillis)
          store(sample.toString + ",1")
          Thread.sleep(50)
        }
      }
    }.start()
  }

  /**
   * Return one pseudo-random element of `a`.
   *
   * @param a    array to sample from (assumed non-empty)
   * @param n    unused; kept for signature compatibility with callers
   * @param seed seed for the random generator
   */
  def takeSample[T: ClassTag](a: Array[T], n: Int, seed: Long): T = {
    val rnd = new Random(seed)
    a(rnd.nextInt(a.length))
  }

  /** Nothing to release: the producer thread exits once isStopped() is true. */
  override def onStop(): Unit = {}
}
2、第二步:通过 mapWithState 进行相关的状态统计。
import com.crgt.gtdata.custom.CustomSourceWindowByFitValue
import com.crgt.gtdata.window.KafkaToRedis.{conf, resetTime}
import org.apache.commons.lang3.time.DateFormatUtils
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, State, StateSpec, StreamingContext}
/**
* @Author: wpp
* @Date: 2020/5/21 14:58
* 利用 while True 和 awaitTerminationOrTimeout 实现任务重启
* 电脑上把时间调整到23:58分,让他自动执行一两分钟跨时间,等到00:02分,会自动重启任务
**/
/**
 * Daily PV counter: accumulates per-item counts with mapWithState and restarts
 * the StreamingContext every `resetTime` ms (around midnight) so the state is
 * rebuilt from zero for the new day, while the SparkContext stays alive.
 */
object OnlineHotItemsDay {
  def main(args: Array[String]): Unit = {
    // One long-lived SparkContext; only the StreamingContext is recycled below.
    val sparkConf = new SparkConf().setAppName("OnlineHotItemsDay").setMaster("local[2]")
    val sc = new SparkContext(sparkConf)

    // State function: add the incoming pv to the running total kept in `state`,
    // update the state, and emit (item, accumulated count).
    val mapFunction = (shopid: String, pv: Option[Int], state: State[Int]) => {
      val accSum = pv.getOrElse(0) + state.getOption().getOrElse(0)
      state.update(accSum)
      (shopid, accSum)
    }
    val stateSpec = StateSpec.function(mapFunction)

    // `while (true)` + awaitTerminationOrTimeout: each iteration runs one "day",
    // then the StreamingContext is stopped (gracefully, keeping the
    // SparkContext) and a fresh one — with fresh, empty state — is created.
    while (true) {
      val ssc = new StreamingContext(sc, Seconds(10))
      // Checkpoint dir is required by mapWithState; a relative "./" path fails.
      ssc.checkpoint("file:///D:/checkpoint/todaycnt/")
      val hottestStream = ssc.receiverStream(new CustomSourceWindowByFitValue())

      // Records arrive as "item,time"; keep only the item and map to (item, 1).
      val searchPair = hottestStream.map(_.split(",")(0))
        .filter(_.nonEmpty)
        .map(item => (item, 1))

      // stateSnapshots() emits the full (item, total) state every batch.
      val shopTrafficUpdateStateDStream = searchPair.mapWithState(stateSpec).stateSnapshots()
      shopTrafficUpdateStateDStream.foreachRDD { rdd =>
        rdd.top(5).foreach(println)
        // BUG FIX: the original pattern used "YYYY" (week-based year), which
        // prints the wrong year near the year boundary (e.g. 2020-12-28 shows
        // as 2021) — exactly where this cross-midnight job runs. Use "yyyy".
        println("=====================" + DateFormatUtils.format(System.currentTimeMillis(), "yyyy-MM-dd HH:mm:ss"))
      }

      ssc.start()
      // Block for `resetTime` ms, then fall through to restart the context.
      ssc.awaitTerminationOrTimeout(resetTime)
      // stop(stopSparkContext = false, stopGracefully = true): keep sc alive.
      ssc.stop(false, true)
    }
  }
}
3、验证:把电脑时间调整到 23:58,让它自动跨过零点执行一两分钟;到 00:02 时任务会自动重启并清零。执行结果如下:
=====================2020-05-31 23:58:51
(9,21)
(8,14)
(7,14)
(5,36)
(3,8)
=====================2020-05-31 23:59:02
(9,46)
(8,34)
(7,24)
(5,59)
(3,28)
=====================2020-05-31 23:59:11
(9,62)
(8,61)
(7,35)
(5,75)
(3,57)
=====================2020-05-31 23:59:20
(9,82)
(8,83)
(7,59)
(5,97)
(3,72)
=====================2020-05-31 23:59:31
(9,102)
(8,105)
(7,73)
(5,123)
(3,87)
=====================2020-05-31 23:59:41
(9,119)
(8,127)
(7,89)
(5,145)
(3,113)
=====================2020-05-31 23:59:51
2020-06-01 00:00:00,012 ERROR --- [ dispatcher-event-loop-0] org.apache.spark.streaming.scheduler.ReceiverTracker (line: 70) : Deregistered receiver for stream 0: Stopped by driver
Exception in thread "Socket Receiver" org.apache.spark.SparkException: Cannot add data as BlockGenerator has not been started or has been stopped
at org.apache.spark.streaming.receiver.BlockGenerator.addData(BlockGenerator.scala:173)
at org.apache.spark.streaming.receiver.ReceiverSupervisorImpl.pushSingle(ReceiverSupervisorImpl.scala:120)
at org.apache.spark.streaming.receiver.Receiver.store(Receiver.scala:119)
at com.crgt.gtdata.custom.CustomSourceWindowByFitValue$$anon$1.run(CustomSourceWindowByFitValue.scala:27)
(9,139)
(8,141)
(7,116)
(5,163)
(3,119)
=====================2020-06-01 00:00:00
(9,139)
(8,141)
(7,116)
(5,163)
(3,119)
=====================2020-06-01 00:00:10
(9,139)
(8,141)
(7,116)
(5,163)
(3,119)
=====================2020-06-01 00:00:20
(9,139)
(8,141)
(7,116)
(5,163)
(3,119)
=====================2020-06-01 00:00:30
(9,139)
(8,141)
(7,116)
(5,163)
(3,119)
=====================2020-06-01 00:00:40
(9,139)
(8,141)
(7,116)
(5,163)
(3,119)
=====================2020-06-01 00:00:50
(9,139)
(8,141)
(7,116)
(5,163)
(3,119)
=====================2020-06-01 00:01:00
(9,139)
(8,141)
(7,116)
(5,163)
(3,119)
=====================2020-06-01 00:01:10
(9,139)
(8,141)
(7,116)
(5,163)
(3,119)
=====================2020-06-01 00:01:20
(9,139)
(8,141)
(7,116)
(5,163)
(3,119)
=====================2020-06-01 00:01:30
(9,139)
(8,141)
(7,116)
(5,163)
(3,119)
=====================2020-06-01 00:01:40
(9,139)
(8,141)
(7,116)
(5,163)
(3,119)
=====================2020-06-01 00:01:50
(9,139)
(8,141)
(7,116)
(5,163)
(3,119)
=====================2020-06-01 00:02:00
(9,16)
(8,14)
(7,22)
(5,21)
(3,19)
=====================2020-06-01 00:02:11
(9,35)
(8,36)
(7,48)
(5,31)
(3,41)
=====================2020-06-01 00:02:20
(9,49)
(8,59)
(7,73)
(5,41)
(3,66)
=====================2020-06-01 00:02:31