Spark Streaming 连接 Kafka 进行 WordCount

依赖:

<dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
      <version>2.2.0</version>
</dependency>
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

/**
  * Streaming word count over a Kafka topic.
  *
  * Reads string records from Kafka through the kafka-0-10 direct stream, splits
  * every record value on single spaces, keeps a running count per word with
  * `updateStateByKey`, prints the counts each batch, and then commits the
  * batch's Kafka offsets back to Kafka.
  */
object SparkSteamingLogAnalysis {

  /**
    * Entry point: builds the streaming context, wires the word-count pipeline
    * and blocks until the streaming job terminates.
    *
    * @param args command-line arguments (currently unused)
    */
  def main(args: Array[String]): Unit = {
    // Fall back to local mode only when no master was supplied from outside
    // (e.g. `spark-submit --master ...`), instead of always overriding it.
    val conf = new SparkConf()
      .setAppName("SparkSteamingLogAnalysis")
      .setIfMissing("spark.master", "local[*]")

    // 5-second micro-batches.
    val ssc = new StreamingContext(conf, Seconds(5))
    // Checkpoint directory; updateStateByKey needs checkpointing to keep its state.
    ssc.checkpoint("G:\\WordCount")

    // Read the data from Kafka.
    val sheData: InputDStream[ConsumerRecord[String, String]] =
      getKafka(ssc, "kafka_spark_redis", "001")

    // New state for a word = occurrences in the current batch + previous total
    // (the previous total is 0 the first time the word is seen).
    val updateFunc: (Seq[Int], Option[Int]) => Option[Int] =
      (curVal, preVal) => Some(curVal.sum + preVal.getOrElse(0))

    // Process the data: running word count across batches.
    val wordCounts = sheData
      .map(_.value())
      .filter(_ != null)    // Kafka record values may be null (Java API); skip them
      .flatMap(_.split(" "))
      .filter(_.nonEmpty)   // blank lines / repeated spaces yield empty tokens
      .map(word => (word, 1))
      .updateStateByKey(updateFunc)
    wordCounts.print()

    // Registered after the print() output operation so that a batch's offsets
    // are queued for commit after that batch's output operation, not before it.
    sheData.foreachRDD { rdd =>
      val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      sheData.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    }

    ssc.start()
    ssc.awaitTermination()
  }


  /**
    * Creates a direct Kafka input stream of string keys and values.
    *
    * Auto-commit is disabled in the consumer configuration, so the caller is
    * responsible for committing offsets.
    *
    * @param ssc     streaming context the stream is attached to
    * @param topic   Kafka topic to subscribe to
    * @param groupId Kafka consumer group id
    * @return the input stream of consumer records for `topic`
    */
  def getKafka(ssc: StreamingContext,
               topic: String,
               groupId: String): InputDStream[ConsumerRecord[String, String]] = {
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "hdp-1:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      // With no committed offset for the group, start from the newest records.
      "auto.offset.reset" -> "latest",
      "fetch.max.wait.ms" -> Integer.valueOf(500),
      "enable.auto.commit" -> java.lang.Boolean.valueOf(false)
    )
    val topics = Array(topic)
    KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )
  }
}

 

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值