Spark Streaming Word Count Examples

First, define a trait StreamingExample that extends App, sets the log level to WARN, and declares a few constants for the example objects below to use.

import org.apache.log4j.{Level, Logger}

trait StreamingExample extends App {

  Logger.getRootLogger.setLevel(Level.WARN)

  val hostname = "localhost"
  val port = 9999
  val checkpointDir = "spark_streaming/checkpoint"
  val outputDir = "spark_streaming/output"
  val inputDir = "spark_streaming/input"

}

1. Consuming data from HDFS

object HdfsWordCount extends StreamingExample {

  val sparkConf = new SparkConf().setAppName("HdfsWordCount")
  val ssc = new StreamingContext(sparkConf, Seconds(2))

  val lines = ssc.textFileStream(inputDir)
  val words = lines.flatMap(_.split(" "))
  val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
  wordCounts.print()

  ssc.start()
  ssc.awaitTermination()

}
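
Note that textFileStream only processes files that appear in inputDir after the streaming context has started, and each file should show up atomically (write it elsewhere, then move or rename it into the monitored directory). A minimal feeder sketch for testing, run from a separate process; the file names here are hypothetical:

import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths, StandardCopyOption}

// Write to a hidden temp file first (names starting with "." are ignored by the
// file stream), then move it into place so a complete file appears atomically.
val tmp = Paths.get("spark_streaming/input/.words.txt.tmp")
val dst = Paths.get("spark_streaming/input/words.txt")
Files.write(tmp, "hello world hello spark".getBytes(StandardCharsets.UTF_8))
Files.move(tmp, dst, StandardCopyOption.ATOMIC_MOVE)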

2. Consuming data from a TCP socket

/**
  * Test with: $ nc -lk 9999
  * Consumes data from a SocketInputDStream (given a hostname and port).
  **/
object NetworkWordCount extends StreamingExample {
  // local[3] gives the application three threads: the receiver occupies one, leaving the rest for computation
  // (a receiver-based stream needs at least local[2], otherwise no data gets processed)
  val sparkConf = new SparkConf().setMaster("local[3]").setAppName("NetworkWordCount")
  val ssc = new StreamingContext(sparkConf, Seconds(5))
  val lines = ssc.socketTextStream(hostname, port, StorageLevel.MEMORY_AND_DISK_SER)

  val words = lines.flatMap(_.split(" "))

  val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)

  wordCounts.print()

  ssc.start()
  ssc.awaitTermination()
}

3. Consuming data from an in-process queue

/**
  * Consumes data from a QueueInputDStream (RDDs pushed into a queue in the same process).
  */
object QueueStream extends StreamingExample {

  // 1. Set up the configuration
  val sparkConf = new SparkConf().setMaster("local[3]").setAppName("QueueStream")

  // 2. Create the StreamingContext
  val ssc = new StreamingContext(sparkConf, Seconds(2))
  val rddQueue = new mutable.Queue[RDD[Int]]()

  // 3. Define the input stream
  val inputStream = ssc.queueStream(rddQueue)

  // 4. Build the processing pipeline (DAG)
  val mappedStream = inputStream.map(x => (x % 10, 1))
  val reducedStream = mappedStream.reduceByKey(_ + _)
  reducedStream.print()

  // 5. Start the application
  ssc.start()


  // Push RDDs into rddQueue, one per second for 30 seconds
  for (i <- 1 to 30) {
    rddQueue.synchronized {
      rddQueue += ssc.sparkContext.makeRDD(1 to 1000, 10)
    }
    Thread.sleep(1000)
  }

  // 6. Stop the application
  ssc.stop()
}

4. Consuming data from Kafka

object DirectKafkaWordCount extends StreamingExample {

  // Command-line arguments: <brokers> <groupId> <topics>
  val Array(brokers, groupId, topics) = args

  // Create context with 2 second batch interval
  val sparkConf = new SparkConf().setAppName("DirectKafkaWordCount")
  val ssc = new StreamingContext(sparkConf, Seconds(2))

  // Create direct kafka stream with brokers and topics
  val topicsSet = topics.split(",").toSet
  val kafkaParams = Map[String, Object](
    ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokers,
    ConsumerConfig.GROUP_ID_CONFIG -> groupId,
    ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
    ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer])
  val messages = KafkaUtils.createDirectStream[String, String](
    ssc,
    // Three location strategies for assigning partitions to executors: 1. PreferConsistent 2. PreferBrokers 3. PreferFixed
    LocationStrategies.PreferConsistent,

    // Three consumer strategies: 1. Subscribe: a fixed set of topics  2. SubscribePattern: match topics by regex
    // (newly created topics are picked up automatically)  3. Assign: explicit topics and partitions
    // (see the sketch after this example)
    ConsumerStrategies.Subscribe[String, String](topicsSet, kafkaParams))

  // Get the lines, split them into words, count the words and print
  val lines = messages.map(_.value)
  val words = lines.flatMap(_.split(" "))
  val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
  wordCounts.print()

  // Start the computation
  ssc.start()
  ssc.awaitTermination()
}
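
The example above uses PreferConsistent with Subscribe. For the Assign and PreferFixed variants mentioned in the comments, a minimal sketch follows; the topic name, partition numbers, and executor hosts are hypothetical:

import org.apache.kafka.common.TopicPartition

// Pin the stream to two explicit partitions of "topicA", and ask the scheduler to
// prefer running each partition's consumer on a fixed executor host.
val partitions = Seq(new TopicPartition("topicA", 0), new TopicPartition("topicA", 1))
val hostMap = Map(
  new TopicPartition("topicA", 0) -> "executor-host-1",
  new TopicPartition("topicA", 1) -> "executor-host-2")

val assignedStream = KafkaUtils.createDirectStream[String, String](
  ssc,
  LocationStrategies.PreferFixed(hostMap),
  ConsumerStrategies.Assign[String, String](partitions, kafkaParams))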

5. Writing a custom Receiver

object CustomReceiver extends StreamingExample {

  val sparkConf = new SparkConf().setAppName("CustomReceiver")
  val ssc = new StreamingContext(sparkConf, Seconds(1))

  val lines = ssc.receiverStream(new CustomReceiver(hostname, port))

  val words = lines.flatMap(_.split(" "))
  val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
  wordCounts.print()
  ssc.start()
  ssc.awaitTermination()

}

class CustomReceiver(host: String, port: Int)
  extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) {

  def onStart() {
    new Thread("Socket Receiver") {
      override def run() {
        receive()
      }
    }.start()
  }

  def onStop() {
    // Nothing to clean up here: the receiving thread stops itself once isStopped() returns true
  }

  /** Create a socket connection and receive data until receiver is stopped */
  private def receive() {
    var socket: Socket = null
    var userInput: String = null
    try {
      println(s"Connecting to $host : $port")
      socket = new Socket(host, port)
      println(s"Connected to $host : $port")
      val reader = new BufferedReader(
        new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8))
      userInput = reader.readLine()
      while (!isStopped && userInput != null) {
        store(userInput)
        userInput = reader.readLine()
      }
      reader.close()
      socket.close()
      println("Stopped receiving")
      restart("Trying to connect again")
    } catch {
      case e: java.net.ConnectException =>
        restart(s"Error connecting to $host : $port", e)
      case t: Throwable =>
        restart("Error receiving data", t)
    }
  }
}

6. Computing the word count with Spark SQL

object SqlNetworkWordCount extends StreamingExample {

  val sparkConf = new SparkConf().setMaster("local[3]").setAppName("SqlNetworkWordCount")
  val ssc = new StreamingContext(sparkConf, Seconds(2))
  val lines = ssc.socketTextStream(hostname, port, StorageLevel.MEMORY_AND_DISK_SER)
  val words = lines.flatMap(_.split(" "))

  // Convert RDDs of the words DStream to DataFrame and run SQL query
  words.foreachRDD { (rdd: RDD[String], time: Time) =>

    // Get the singleton SparkSession
    val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)

    import spark.implicits._

    // RDD[String] -> RDD[case class] -> DataFrame
    val wordsDataFrame = rdd.map(w => Record(w)).toDF()

    // Register a temporary view: words
    wordsDataFrame.createOrReplaceTempView("words")

    // Run the SQL query
    val wordCountsDataFrame = spark.sql("select word, count(*) as total from words group by word")

    // Print the result
    println(s"========= $time =========")
    wordCountsDataFrame.show()
  }

  ssc.start()
  ssc.awaitTermination()
}
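
The SQL query above can equivalently be written with the DataFrame API; a minimal sketch of the same aggregation, which would replace the spark.sql(...) call inside the foreachRDD block:

// Same aggregation as the SQL above, expressed with the DataFrame API.
val wordCountsDataFrame = wordsDataFrame
  .groupBy("word")
  .count()
  .withColumnRenamed("count", "total")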

case class Record(word: String)

object SparkSessionSingleton {

  @transient private var instance: SparkSession = _

  def getInstance(sparkConf: SparkConf): SparkSession = {
    if (instance == null) {
      instance = SparkSession
        .builder
        .config(sparkConf)
        .getOrCreate()
    }
    instance
  }
}

7. Accumulating word counts across batches with mapWithState

/**
  * Consumes data from a SocketInputDStream and computes a running (cumulative) word count.
  **/
object StatefulNetworkWordCount extends StreamingExample {

  val sparkConf = new SparkConf().setMaster("local[4]").setAppName("StatefulNetworkWordCount")
  val ssc = new StreamingContext(sparkConf, Seconds(1))
  ssc.checkpoint("checkpoint")

  // Initial state RDD for mapWithState operation
  val initialRDD = ssc.sparkContext.parallelize(List(("hello", 1), ("world", 1)))

  val lines = ssc.socketTextStream(hostname, port)
  val words = lines.flatMap(_.split(" "))
  val wordDstream = words.map(x => (x, 1))

  // Accumulate the count per word, keeping the running total in State
  val mappingFunc = (word: String, one: Option[Int], state: State[Int]) => {
    val sum = one.getOrElse(0) + state.getOption.getOrElse(0)
    val output = (word, sum)
    state.update(sum)
    output
  }

  val stateDstream = wordDstream.mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD))
  stateDstream.print()
  ssc.start()
  ssc.awaitTermination()
}
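
StateSpec also supports an explicit partition count and a state timeout. A minimal sketch with arbitrary example values (4 partitions, a 10-minute timeout; Minutes comes from org.apache.spark.streaming). Note that when a timeout is set, the mapping function is also invoked for expiring keys, so state.update() must be guarded with state.isTimingOut():

// Variant of mappingFunc that is safe to combine with a timeout.
val mappingFuncWithTimeout = (word: String, one: Option[Int], state: State[Int]) => {
  val sum = one.getOrElse(0) + state.getOption.getOrElse(0)
  if (!state.isTimingOut()) state.update(sum)   // updating an expiring key would throw
  (word, sum)
}

val spec = StateSpec.function(mappingFuncWithTimeout)
  .initialState(initialRDD)
  .numPartitions(4)        // number of partitions for the state RDDs
  .timeout(Minutes(10))    // drop a key's state after 10 idle minutes

val stateDstreamWithTimeout = wordDstream.mapWithState(spec)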

8. Recovering the application from a checkpoint

/**
  * Demonstrates rebuilding the application from a checkpoint, and how to use broadcast variables and accumulators with it.
  **/
object RecoverableNetworkWordCount extends StreamingExample {

  // If a checkpoint exists, restore the application from it; otherwise create a new one
  val ssc = StreamingContext.getOrCreate(checkpointDir,
    () => createContext(hostname, port, outputDir, checkpointDir))
  ssc.start()
  ssc.awaitTermination()

  def createContext(ip: String, port: Int, outputPath: String, checkpointDirectory: String): StreamingContext = {
    println("Creating new context .....")

    // Delete the output file if it already exists
    val outputFile = new File(outputPath)
    if (outputFile.exists()) outputFile.delete()

    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("RecoverableNetworkWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(3))
    ssc.checkpoint(checkpointDirectory)

    val lines = ssc.socketTextStream(ip, port)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map((_, 1)).reduceByKey(_ + _)


    wordCounts.foreachRDD { (rdd: RDD[(String, Int)], time: Time) =>

      // Get the word blacklist (a lazily created broadcast variable)
      val blacklist = WordBlacklist.getInstance(rdd.sparkContext)

      // Accumulator counting words dropped by the blacklist
      val droppedWordsCounter = DroppedWordsCounter.getInstance(rdd.sparkContext)

      // Filter out blacklisted words
      val counts = rdd.filter { case (word, count) =>
        if (blacklist.value.contains(word)) {
          droppedWordsCounter.add(count)
          false
        } else {
          true
        }
      }.collect().mkString("[", ", ", "]")

      // Print and persist the filtered counts
      val output = s"Counts at time $time $counts"
      println(output)
      println(s"Dropped ${droppedWordsCounter.value} word(s) totally")
      println(s"Appending to ${outputFile.getAbsolutePath}")
      Files.append(output + "\n", outputFile, Charset.defaultCharset())
    }

    ssc
  }
}


object WordBlacklist {

  @volatile private var instance: Broadcast[Seq[String]] = null

  def getInstance(sc: SparkContext): Broadcast[Seq[String]] = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          val wordBlacklist = Seq("a", "b", "c")
          instance = sc.broadcast(wordBlacklist)
        }
      }
    }
    instance
  }
}

object DroppedWordsCounter {

  @volatile private var instance: LongAccumulator = _

  def getInstance(sc: SparkContext): LongAccumulator = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          instance = sc.longAccumulator("WordsInBlacklistCounter")
        }
      }
    }
    instance
  }
}
