Using the window() operation to aggregate data over a time window
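Both the window length and the slide interval must be integer multiples of the batch interval, otherwise Spark Streaming throws an exception when the windowed DStream is built. A minimal self-check sketch (a standalone illustration, not part of the original program; the constant names mirror the listing below):

import org.apache.spark.streaming.Seconds

object WindowIntervalCheck {
  def main(args: Array[String]): Unit = {
    val STREAMING_BATCH_INTERVAL  = Seconds(1)
    val STREAMING_WINDOW_INTERVAL = STREAMING_BATCH_INTERVAL * 3
    val STREAMING_SLIDER_INTERVAL = STREAMING_BATCH_INTERVAL * 3
    // Duration.isMultipleOf does the modulo check Spark uses to validate window durations
    require(STREAMING_WINDOW_INTERVAL.isMultipleOf(STREAMING_BATCH_INTERVAL),
      "window length must be a multiple of the batch interval")
    require(STREAMING_SLIDER_INTERVAL.isMultipleOf(STREAMING_BATCH_INTERVAL),
      "slide interval must be a multiple of the batch interval")
  }
}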
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object J_WindowOrderTotalStreaming {
  // Batch interval; window length and slide interval are both 3 batches,
  // so each window covers 3 seconds of data and consecutive windows do not overlap.
  val STREAMING_BATCH_INTERVAL = Seconds(1)
  val STREAMING_WINDOW_INTERVAL = STREAMING_BATCH_INTERVAL * 3
  val STREAMING_SLIDER_INTERVAL = STREAMING_BATCH_INTERVAL * 3

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[3]")
      .setAppName("J_WindowOrderTotalStreaming")
    val ssc = new StreamingContext(conf, STREAMING_BATCH_INTERVAL)
    ssc.sparkContext.setLogLevel("WARN")

    // Kafka direct-stream parameters (0.8 API): broker list and offset reset policy
    val kafkaParams: Map[String, String] = Map(
      "metadata.broker.list" ->
        "bigdata-hpsk01.huadian.com:9092,bigdata-hpsk01.huadian.com:9093,bigdata-hpsk01.huadian.com:9094",
      "auto.offset.reset" -> "largest"
    )
    val topics: Set[String] = Set("orderTopic")

    // Keep only the message value from each (key, value) pair
    val lines: DStream[String] = KafkaUtils
      .createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
      .map(_._2)

    // Window the raw stream first, then count orders per key within each window
    val inputDStream = lines.window(STREAMING_WINDOW_INTERVAL, STREAMING_SLIDER_INTERVAL)
    val orderDStream: DStream[(Int, Int)] = inputDStream.transform(rdd => {
      rdd
        .filter(line => line.trim.length > 0 && line.trim.split(",").length == 3)
        .map(line => {
          val splits = line.split(",")
          (splits(1).toInt, 1)
        })
    })

    val orderCountDStream = orderDStream.reduceByKey(_ + _)
    orderCountDStream.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
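The filter and map above assume that each Kafka message is a comma-separated record with exactly three fields and that the second field is an integer id (for example a province id) used as the grouping key. The source does not spell out the record layout, so the sample below is only a hypothetical illustration of the parsing step:

object ParseOrderLineDemo {
  def main(args: Array[String]): Unit = {
    // Hypothetical layout: orderId,provinceId,price
    val line = "20190811001,110,35.5"
    if (line.trim.length > 0 && line.trim.split(",").length == 3) {
      val splits = line.split(",")
      val pair = (splits(1).toInt, 1)
      println(pair) // (110,1): one order counted for key 110
    }
  }
}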
Implementing windowed aggregation with reduceByKeyAndWindow, approach 1: no checkpoint required
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object K_WindowOrderTotalStreaming {
  // Window length is 3 batches (15s) and the slide interval is 2 batches (10s),
  // so consecutive windows overlap by one batch.
  val STREAMING_BATCH_INTERVAL = Seconds(5)
  val STREAMING_WINDOW_INTERVAL = STREAMING_BATCH_INTERVAL * 3
  val STREAMING_SLIDER_INTERVAL = STREAMING_BATCH_INTERVAL * 2

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[3]")
      .setAppName("K_WindowOrderTotalStreaming")
    val ssc: StreamingContext = new StreamingContext(conf, STREAMING_BATCH_INTERVAL)
    ssc.sparkContext.setLogLevel("WARN")

    val kafkaParams: Map[String, String] = Map(
      "metadata.broker.list" ->
        "bigdata-hpsk01.huadian.com:9092,bigdata-hpsk01.huadian.com:9093,bigdata-hpsk01.huadian.com:9094",
      "auto.offset.reset" -> "largest"
    )
    val topics: Set[String] = Set("orderTopic")

    val kafkaDStream: DStream[String] = KafkaUtils
      .createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
      .map(_._2)

    // Parse each record into (key, 1); malformed lines are dropped
    val orderDStream: DStream[(Int, Int)] = kafkaDStream.transform(rdd => {
      rdd
        .filter(line => line.trim.length > 0 && line.trim.split(",").length == 3)
        .map(line => {
          val splits = line.split(",")
          (splits(1).toInt, 1)
        })
    })

    // Windowing and aggregation in a single step; no checkpoint is required
    // because no inverse reduce function is supplied.
    val orderCountDStream = orderDStream.reduceByKeyAndWindow(
      (v1: Int, v2: Int) => v1 + v2,
      STREAMING_WINDOW_INTERVAL,
      STREAMING_SLIDER_INTERVAL
    )
    orderCountDStream.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
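Compared with the first listing, reduceByKeyAndWindow folds the separate window() and reduceByKey() steps into a single call on the already-parsed (key, 1) stream. The two pipelines below produce the same results; this is a fragment that assumes orderDStream and the interval constants defined in the listing above:

// Listing 1 style: window the stream, then reduce within each window
val windowedThenReduced = orderDStream
  .window(STREAMING_WINDOW_INTERVAL, STREAMING_SLIDER_INTERVAL)
  .reduceByKey(_ + _)

// Listing 2 style: the same aggregation in one call
val reducedByKeyAndWindow = orderDStream.reduceByKeyAndWindow(
  (v1: Int, v2: Int) => v1 + v2,
  STREAMING_WINDOW_INTERVAL,
  STREAMING_SLIDER_INTERVAL
)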
Implementing windowed aggregation with reduceByKeyAndWindow, approach 2: checkpoint required (incremental update with an inverse reduce function)
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object L_TrendOrderTotalStreaming {
  // A checkpoint directory is required because the inverse-reduce variant of
  // reduceByKeyAndWindow keeps window state across batches.
  val CHECK_POINT_PATH = "file:///E:\\JavaWork\\20190811\\test93"
  val STREAMING_BATCH_INTERVAL = Seconds(1)
  val STREAMING_WINDOW_INTERVAL = STREAMING_BATCH_INTERVAL * 3
  val STREAMING_SLIDER_INTERVAL = STREAMING_BATCH_INTERVAL * 3

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[3]")
      .setAppName("L_TrendOrderTotalStreaming")
    val ssc = new StreamingContext(conf, STREAMING_BATCH_INTERVAL)
    ssc.sparkContext.setLogLevel("WARN")
    ssc.checkpoint(CHECK_POINT_PATH)

    val kafkaParams: Map[String, String] = Map(
      "metadata.broker.list" ->
        "bigdata-hpsk01.huadian.com:9092,bigdata-hpsk01.huadian.com:9093,bigdata-hpsk01.huadian.com:9094",
      "auto.offset.reset" -> "largest"
    )
    val topics: Set[String] = Set("orderTopic")

    val lines: DStream[String] = KafkaUtils
      .createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
      .map(_._2)

    val orderDStream: DStream[(Int, Int)] = lines.transform(rdd => {
      rdd
        .filter(line => line.trim.length > 0 && line.trim.split(",").length == 3)
        .map(line => {
          val splits = line.split(",")
          (splits(1).toInt, 1)
        })
    })

    // Incremental window aggregation: add counts entering the window and
    // subtract counts leaving it, instead of recomputing the whole window.
    val orderCountDStream = orderDStream.reduceByKeyAndWindow(
      (v1: Int, v2: Int) => v1 + v2, // values entering the window
      (v1: Int, v2: Int) => v1 - v2, // values leaving the window
      STREAMING_WINDOW_INTERVAL,
      STREAMING_SLIDER_INTERVAL
    )
    orderCountDStream.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
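One caveat with the inverse-reduce variant: a key stays in the window state even after its count has fallen back to zero, because the subtraction only decrements the value. The reduceByKeyAndWindow overload that also takes a filter function can evict such keys. A sketch of how the call above could be changed (a fragment of the same main method; the predicate shown is one reasonable choice, not taken from the source):

val orderCountDStream = orderDStream.reduceByKeyAndWindow(
  (v1: Int, v2: Int) => v1 + v2,             // values entering the window
  (v1: Int, v2: Int) => v1 - v2,             // values leaving the window
  STREAMING_WINDOW_INTERVAL,
  STREAMING_SLIDER_INTERVAL,
  filterFunc = (kv: (Int, Int)) => kv._2 > 0 // drop keys whose windowed count fell to 0
)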