Spark Streaming与Kafka集成进行数据处理

最新推荐文章于 2023-01-30 16:19:05 发布

曾一航

最新推荐文章于 2023-01-30 16:19:05 发布

阅读量258

点赞数

本文链接：https://blog.csdn.net/weixin_42744795/article/details/82083569

版权

Spark Streaming与Kafka的集成按Kafka版本分为两个版本（spark-streaming-kafka-0-8 与 spark-streaming-kafka-0-10），详见官方文档：http://spark.apache.org/docs/2.2.0/streaming-kafka-integration.html

下面是0.8版本（Direct方式）的例子，可参考官方示例：https://github.com/apache/spark/blob/v2.2.0/examples/src/main/scala/org/apache/spark/examples/streaming/DirectKafkaWordCount.scala

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Streaming word count over Kafka topics, built on
 * `KafkaUtils.createDirectStream` from the Kafka 0.8 integration.
 *
 * Usage: `streamingKafka [brokers [topics]]`
 *  - `brokers`: comma-separated `host:port` list; defaults to `zeng151.com:9092`
 *  - `topics`:  comma-separated topic names; defaults to `weblogs`
 *
 * Every 5 seconds the per-batch word counts are printed to stdout.
 */
object streamingKafka {

  /** Broker list used when no command-line argument is supplied. */
  private val DefaultBrokers = "zeng151.com:9092"

  /** Topic list used when fewer than two command-line arguments are supplied. */
  private val DefaultTopics = "weblogs"

  /**
   * Entry point: builds the streaming context, wires the Kafka stream to a
   * word-count pipeline, then blocks until the streaming job terminates.
   *
   * @param args optional `[brokers [topics]]`; extra arguments are ignored
   */
  def main(args: Array[String]): Unit = {
    val (brokers, topics) = args match {
      case Array(b, t, _*) => (b, t)
      case Array(b)        => (b, DefaultTopics)
      case _               => (DefaultBrokers, DefaultTopics)
    }

    // Only fall back to a local master when none was provided. A hard-coded
    // setMaster would take precedence over `spark-submit --master`, which makes
    // the job impossible to deploy on a cluster without editing the source.
    val sparkConf = new SparkConf()
      .setAppName("streamingKafka")
      .setIfMissing("spark.master", "local[2]")

    val ssc = new StreamingContext(sparkConf, Seconds(5))

    val topicsSet = topics.split(",").map(_.trim).filter(_.nonEmpty).toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
    val kfkStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topicsSet)

    // Each record is a (key, value) pair; only the message value is needed.
    val lines = kfkStream.map(_._2)
    // Blank lines and repeated spaces produce empty tokens when splitting on a
    // single space; drop them so "" is never counted as a word.
    val words = lines.flatMap(_.split(" ")).filter(_.nonEmpty)
    val wordCounts = words.map(word => (word, 1L)).reduceByKey(_ + _)
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}