// Counts trending (hot) search words over a sliding window using Spark Streaming.
import org.apache.spark.SparkConf
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds

/**
 * Streams search logs from a socket and prints the trending search words
 * over a sliding window.
 *
 * Input format per line: "user searchWord" (whitespace-separated); the
 * search word is taken from the second field, e.g. "leo 手机".
 *
 * Every 10 seconds, counts per search word are aggregated over the last
 * 60 seconds with `reduceByKeyAndWindow`, sorted by count descending, and
 * the top 3 are printed.
 *
 * @author jhp
 */
object WindowHotWord {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("WindowHotWord")
    // 1-second batch interval; window parameters below must be multiples of it.
    val ssc = new StreamingContext(conf, Seconds(1))

    val searchLogsDStream = ssc.socketTextStream("spark1", 9999)

    // Each log line is "user searchWord"; extract the search word (field 1).
    val searchWordsDStream = searchLogsDStream.map { _.split(" ")(1) }

    val searchWordPairsDStream = searchWordsDStream.map { searchWord => (searchWord, 1) }

    // Incremental windowed count: 60-second window, sliding every 10 seconds.
    val searchWordCountsDStream = searchWordPairsDStream.reduceByKeyAndWindow(
      (v1: Int, v2: Int) => v1 + v2,
      Seconds(60),
      Seconds(10))

    // Sort each windowed RDD by count descending and print the top 3.
    val finalDStream = searchWordCountsDStream.transform(searchWordCountsRDD => {
      // Swap to (count, word) so sortByKey orders by count.
      val countSearchWordsRDD = searchWordCountsRDD.map(tuple => (tuple._2, tuple._1))
      val sortedCountSearchWordsRDD = countSearchWordsRDD.sortByKey(false)
      // Swap back to (word, count). (Bug fix: the original identity map
      // left tuples as (count, word).)
      val sortedSearchWordCountsRDD = sortedCountSearchWordsRDD.map(tuple => (tuple._2, tuple._1))

      val top3SearchWordCounts = sortedSearchWordCountsRDD.take(3)
      for (tuple <- top3SearchWordCounts) {
        println(tuple)
      }

      // Bug fix: return the sorted RDD (the original returned the unsorted
      // input, so print() below showed unsorted data).
      sortedSearchWordCountsRDD
    })

    // An output operation is required for the transform above to execute.
    finalDStream.print()

    ssc.start()
    ssc.awaitTermination()
  }
}