A Simple Application of Accumulators and Broadcast Variables in Spark Streaming

package spark

/**
 * Monitor data arriving over the network, filter the words in the stream against a
 * blacklist held in a broadcast variable, and count the filtered-out records with an accumulator.
 */

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.Accumulator
/**
 * Singleton object WordBlacklist: lazily registers the broadcast variable, so that it
 * can be re-created when the StreamingContext is recovered from the checkpoint directory.
 */
object WordBlacklist {

  @volatile private var instance: Broadcast[Seq[String]] = null

  def getInstance(sc: SparkContext): Broadcast[Seq[String]] = {
    if (instance == null) {
      synchronized {   // double-checked locking
        if (instance == null) {
          val wordBlacklist = Seq("hello", "world") // words on the blacklist
          instance = sc.broadcast(wordBlacklist) // broadcast variable: once created, it is shipped to every executor
        }
      }
    }
    instance
  }
}

/**
 * Singleton object DroppedWordsCounter: lazily registers the accumulator, so that it
 * can be re-created when the StreamingContext is recovered from a checkpoint.
 * (sc.accumulator is deprecated since Spark 2.0; a LongAccumulator variant is sketched after this listing.)
 */
object DroppedWordsCounter {

  @volatile private var instance: Accumulator[Int] = null

  def getInstance(sc: SparkContext): Accumulator[Int] = {
    if (instance == null) {  // create the accumulator only if it does not already exist
      synchronized {
        if (instance == null) {
          instance = sc.accumulator(0) // counts the records dropped by the blacklist filter
        }
      }
    }
    instance
  }
}

object Streaming_AccumulatorAndBroadcast {

  // 1. Encapsulate the business logic; 2. create and return the StreamingContext
  def createContext(ip: String, port: Int, outputPath: String, checkpointDirectory: String) // takes four parameters
  : StreamingContext = {

    // If "Creating new context" is not printed, the StreamingContext was recovered from the checkpoint directory
    println("Creating new context")


    // Create the StreamingContext with a 5-second batch interval
    val sparkConf = new SparkConf().setAppName("Streaming_AccumulatorAndBroadcast").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    ssc.checkpoint(checkpointDirectory)

    // Create a socket text stream on ip:port; each '\n'-delimited line carries comma-separated words
    val lines = ssc.socketTextStream(ip, port)
    val words = lines.flatMap(_.split(","))
    val wordCounts = words.map((_, 1)).reduceByKey(_ + _)  // map each word to (word, 1) and sum the counts per key
    // e.g. wordCounts = {RDD1={(hello,2),(spark,1)}, RDD2={(world,1),(spark,3)}, ...} (see the standalone reduceByKey sketch after this listing)

    wordCounts.foreachRDD { rdd =>

      // Get or register the blacklist broadcast variable: Broadcast(Seq("hello", "world"))
      val blacklist = WordBlacklist.getInstance(rdd.sparkContext) // rdd.sparkContext returns the SparkContext this RDD belongs to

      // Get or register the dropped-words accumulator (initial value 0)
      val droppedWordsCounter = DroppedWordsCounter.getInstance(rdd.sparkContext) // same SparkContext as above

      // Filter words against the blacklist, count the dropped records with droppedWordsCounter, and print them
      val filteredRdd = rdd.filter { case (word, count) =>
        if (blacklist.value.contains(word)) {   // is the word on the blacklist ("hello" or "world")?
          droppedWordsCounter.add(count)        // accumulate how many occurrences were dropped
          println("the word: " + word + " is deleted " + count + " times")
          false  // drop this record
        } else {
          true   // keep this record
        }
      }
      filteredRdd.saveAsTextFile(outputPath)  // save the filtered results
      filteredRdd.foreach(println)            // print the remaining records
      println("the accumulator is " + droppedWordsCounter.value + "!!!!!!!!!!!!!")  // print the running total of dropped records

    }
    ssc  // return the StreamingContext
  }

  def main(args: Array[String]) {


    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    if (args.length != 4) {
      System.err.println("Your arguments were " + args.mkString("[", ", ", "]"))

      System.exit(1)
    }

    // Note: the corresponding directories need to be created in advance!!
    val Array(ip, port, outputPath, checkpointDirectory) = args
    val ssc = StreamingContext.getOrCreate(checkpointDirectory,
      () => createContext(ip, port.toInt,outputPath,checkpointDirectory))
    ssc.start()
    ssc.awaitTermination()
  }
}
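
The accumulator API used in the listing (Accumulator and sc.accumulator) has been deprecated since Spark 2.0. On a 2.x or later build, the DroppedWordsCounter singleton can be backed by the newer LongAccumulator instead; the following is only an illustrative sketch of that variant (the object name DroppedWordsCounterV2 is made up, not part of the original program).

import org.apache.spark.SparkContext
import org.apache.spark.util.LongAccumulator

// Same lazily-created, double-checked-locking singleton as above, but using the
// non-deprecated LongAccumulator API (Spark 2.0+). Illustrative variant only.
object DroppedWordsCounterV2 {

  @volatile private var instance: LongAccumulator = null

  def getInstance(sc: SparkContext): LongAccumulator = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          // Registers a named accumulator with the SparkContext, starting at 0
          instance = sc.longAccumulator("DroppedWordsCounter")
        }
      }
    }
    instance
  }
}

Inside foreachRDD it is used exactly like the original: droppedWordsCounter.add(count) in the filter and droppedWordsCounter.value on the driver.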

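For intuition about the map((_, 1)).reduceByKey(_ + _) step in the listing, here is a standalone sketch on a plain RDD (a minimal local-mode example with made-up input; each micro-batch RDD in the streaming job goes through the same transformation):

import org.apache.spark.{SparkConf, SparkContext}

object ReduceByKeyDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("ReduceByKeyDemo").setMaster("local[2]"))
    val words = sc.parallelize(Seq("hello", "spark", "hello", "world"))
    // (word, 1) pairs summed per key, exactly as in one micro-batch of the streaming job
    val counts = words.map((_, 1)).reduceByKey(_ + _)
    counts.collect().foreach(println)  // prints (hello,2), (spark,1), (world,1) in some order
    sc.stop()
  }
}
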
When running the program, manually configure four program arguments in the IDE: ip, port, output path, and checkpoint path, separated by spaces.
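For a local test the argument line might look like localhost 9999 /tmp/streaming-out /tmp/streaming-ckpt (illustrative values, not from the original run).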

As before, the simulator and source file from the earlier post are used to send data on port 9999.
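If that simulator is not at hand, any tool that writes newline-delimited text to a socket will do; for example, running nc -lk 9999 on the same host and typing lines of comma-separated words such as hello,spark,world gives the job input to consume.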


When the main program runs, the simulator connects and starts sending data; the program listens on the socket, processes each batch, and prints the filtered records together with the accumulator total.
