package kafka

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._ // not necessary since Spark 1.3
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.util.LongAccumulator

/**
 * Spark Streaming word-count example.
 *
 * Reads lines from a local socket (localhost:9999, e.g. started with `nc -lk 9999`),
 * counts words per 1-second micro-batch in two ways (Spark SQL and pair-RDD
 * reduceByKey), and filters out blacklisted words while counting how many were
 * dropped via a LongAccumulator.
 */
object sparkStreaming {

  // Lazily-created broadcast of blacklisted words. @volatile plus
  // double-checked locking so the singleton can be re-created on the driver
  // after a restart without racing between foreachRDD invocations.
  @volatile private var wordBlacklistBroadcast: Broadcast[Seq[String]] = null

  /**
   * Gets or registers the singleton broadcast blacklist.
   *
   * @param sc the active SparkContext used to create the broadcast on first call
   * @return the shared broadcast containing Seq("a", "b", "c")
   */
  def getWordBlacklist(sc: SparkContext): Broadcast[Seq[String]] = {
    if (wordBlacklistBroadcast == null) {
      synchronized {
        if (wordBlacklistBroadcast == null) {
          wordBlacklistBroadcast = sc.broadcast(Seq("a", "b", "c"))
        }
      }
    }
    wordBlacklistBroadcast
  }

  // Lazily-created accumulator counting occurrences of blacklisted words;
  // same double-checked-locking singleton pattern as the broadcast above.
  @volatile private var droppedWordsAccumulator: LongAccumulator = null

  /**
   * Gets or registers the singleton accumulator for dropped (blacklisted) words.
   *
   * @param sc the active SparkContext used to create the accumulator on first call
   * @return the shared LongAccumulator named "WordsInBlacklistCounter"
   */
  def getDroppedWordsCounter(sc: SparkContext): LongAccumulator = {
    if (droppedWordsAccumulator == null) {
      synchronized {
        if (droppedWordsAccumulator == null) {
          droppedWordsAccumulator = sc.longAccumulator("WordsInBlacklistCounter")
        }
      }
    }
    droppedWordsAccumulator
  }

  /**
   * Entry point: wires up the streaming pipeline and blocks until termination.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("sparkStreaming")
    val ssc = new StreamingContext(conf, Seconds(1))

    // One text line per event from a local netcat-style server.
    val lines = ssc.socketTextStream("localhost", 9999)

    // Split each line into words.
    val words = lines.flatMap(_.split(" "))

    // Word count #1: Spark SQL over a temporary view, printed per batch.
    words.foreachRDD { rdd =>
      // Get the singleton instance of SparkSession.
      val spark = SparkSession.builder.config(rdd.sparkContext.getConf).getOrCreate()
      import spark.implicits._

      // Convert RDD[String] to DataFrame and register it as a view.
      val wordsDataFrame = rdd.toDF("word")
      wordsDataFrame.createOrReplaceTempView("words")

      // Do word count on the DataFrame using SQL and print it.
      val wordCountsDataFrame =
        spark.sql("select word, count(*) as total from words group by word")
      wordCountsDataFrame.show()
    }

    // Word count #2: classic pair-RDD reduceByKey.
    val pairs = words.map(word => (word, 1))
    val wordCounts = pairs.reduceByKey(_ + _)

    // Print the first ten elements of each RDD generated in this DStream.
    wordCounts.print()

    wordCounts.foreachRDD { (rdd: RDD[(String, Int)], time: Time) =>
      // Get or register the blacklist Broadcast and the dropped-words Accumulator.
      val blacklist = getWordBlacklist(rdd.sparkContext)
      val droppedWordsCounter = getDroppedWordsCounter(rdd.sparkContext)

      // Drop blacklisted words, counting how many occurrences were dropped.
      val counts = rdd.filter { case (word, count) =>
        if (blacklist.value.contains(word)) {
          droppedWordsCounter.add(count)
          false
        } else {
          true
        }
      }.collect().mkString("[", ", ", "]")

      val output = "Counts at time " + time + " " + counts
      // Fix: the original computed `output` but never emitted it (dead code).
      println(output)
    }

    ssc.start()            // Start the computation
    ssc.awaitTermination() // Wait for the computation to terminate
  }
}
// sparkStreaming
// (Blog-page artifact) Latest recommended article published on 2024-06-23 23:34:15.