package SparkStreamingTest.Scala
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * Created by TG.
 * Every 2 seconds, compute the top-3 search words (and their counts)
 * from the last 5 seconds of input.
 */
object ReduceByKeyAndWindowDemo {
  /**
   * Entry point: every 2 seconds, prints the top-3 search words (with their
   * occurrence counts) observed in the last 5 seconds on a socket text stream.
   *
   * @param args unused command-line arguments
   */
  def main(args: Array[String]): Unit = {
    // Reduce log noise so the periodic windowed output is readable.
    Logger.getLogger("org").setLevel(Level.WARN)

    // "local[2]": at least one thread for the socket receiver plus one for processing.
    val conf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[2]")
    // Batch interval of 1s; the window (5s) and slide (2s) below must be multiples of it.
    val ssc = new StreamingContext(conf, Seconds(1))

    // BUG FIX: DStream.checkpoint(...) below requires a checkpoint directory,
    // otherwise ssc.start() fails validation with
    // "The checkpoint directory has not been set".
    ssc.checkpoint("./checkpoint")

    // Lines of text received from host "master", port 6666
    // (default storage level is StorageLevel.MEMORY_AND_DISK_SER_2).
    val linesDStream = ssc.socketTextStream("master", 6666)
    // Checkpoint the stream's lineage every 10 seconds.
    linesDStream.checkpoint(Seconds(10))

    linesDStream.flatMap(_.split(" "))
      .map((_, 1))
      // Sum counts per word over a 5-second window, recomputed every 2 seconds.
      .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(5), Seconds(2))
      .transform(rdd => {
        // Swap to (count, word) so sortByKey orders by count descending, then swap back.
        val top3: Array[(String, Int)] =
          rdd.map(x => (x._2, x._1)).sortByKey(ascending = false).map(x => (x._2, x._1)).take(3)
        // take(3) returns a local Array, but transform must return an RDD,
        // so parallelize the result back into an RDD.
        ssc.sparkContext.parallelize(top3)
      })
      .map(x => x._1 + "出现的次数是:" + x._2)
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}
// Example: reduceByKeyAndWindow development in Spark Streaming
// (originally published 2024-07-25 09:26:43)