1. Spark Streaming's direct approach to reading Kafka performs much better, but it has a drawback: it does not update the offsets in ZooKeeper, which makes the ZooKeeper-based KafkaOffsetMonitor useless for monitoring. Since our streaming job handles a large data volume and has strict robustness requirements, we need KafkaOffsetMonitor to track consumption in real time, so we push the offsets to ZooKeeper ourselves, as shown below:
2. Spark code:
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import kafka.utils.{ZKGroupTopicDirs, ZKStringSerializer, ZkUtils}
import org.I0Itec.zkclient.ZkClient
import org.apache.spark.sql.{SQLContext, SaveMode}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable.ArrayBuffer

object KSConvertStreaming {

  // Output path template: message type / year / month / day
  val saveLocal = "/xxx/parquet/%s/year=%s/month=%s/day=%s"

  // Parse a partition of raw JSON strings and group them by message type.
  def jsonConvert(jsonStrs: Iterator[String]): Iterator[(String, ArrayBuffer[String])] = {
    StreamingUtils.init // project-specific JSON helper
    val typeMap = scala.collection.mutable.Map[String, ArrayBuffer[String]]()
    jsonStrs.foreach { x =>
      val res = StreamingUtils.mapToStr(x)
      if (null != res) {
        val msgType = res.get(0)
        if (!typeMap.contains(msgType)) {
          typeMap += (msgType -> new ArrayBuffer[String]())
        }
        typeMap(msgType) += res.get(1)
      }
    }
    typeMap.iterator
  }

  def main(args: Array[String]): Unit = {

    val Array(maxPartition, maxNumber, windowSeconds, groupName, maxRatePerPartition) = args
    val topicName = "xxx"
    val kafkaAddr = "xxx:9092,xxx:9092,xxx:9092"
    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> kafkaAddr,
      "group.id" -> groupName,
      "auto.offset.reset" -> "largest"
    )
    val topics = Set(topicName)

    println(s"maxPartition -------- $maxPartition")
    println(s"maxNumber -------- $maxNumber")
    println(s"windowSeconds -------- $windowSeconds")
    println(s"groupName -------- $groupName")

    val sparkConf = new SparkConf().setAppName("Streaming_Convert")
      .set("spark.yarn.executor.memoryOverhead", "1024")
      .set("spark.streaming.kafka.maxRatePerPartition", maxRatePerPartition) // max records per second per partition
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .set("spark.reducer.maxSizeInFlight", "1m")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)
    val ssc = new StreamingContext(sc, Seconds(windowSeconds.toInt)) // batch interval in seconds

    // ZooKeeper node layout used by the high-level consumer (and read by KafkaOffsetMonitor):
    // /consumers/<group>/offsets/<topic>/<partition>
    val topicDirs = new ZKGroupTopicDirs(groupName, topicName)

    val zkClient = new ZkClient("xxx:2181,xxx:2181,xxx:2181", Integer.MAX_VALUE, 100000, ZKStringSerializer)
    val children = zkClient.countChildren(topicDirs.consumerOffsetDir)

    var kafkaStream: InputDStream[(String, String)] = null
    var fromOffsets: Map[TopicAndPartition, Long] = Map()
    val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.topic, mmd.message())
    if (children > 0) {
      // Offsets already stored in ZooKeeper: resume from them.
      for (i <- 0 until children) {
        val partitionOffset = zkClient.readData[String](s"${topicDirs.consumerOffsetDir}/$i")
        val tp = TopicAndPartition(topicName, i)
        fromOffsets += (tp -> partitionOffset.toLong)
      }
      kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)
    } else {
      // First run: nothing in ZooKeeper yet, start from auto.offset.reset.
      kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
    }

    var offsetRanges = Array[OffsetRange]()

    kafkaStream.transform { rdd =>
      // Capture this batch's offset ranges before any further transformation.
      offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd
    }.map(_._2).foreachRDD { rdd =>
      val xRDD = rdd.flatMap(_.split("\n")).mapPartitions(x => jsonConvert(x))
      xRDD.persist(StorageLevel.MEMORY_ONLY)
      // Count records per message type for this batch.
      val typeCountMap = xRDD.map(x => (x._1, x._2.size)).collect()
      val typeMap = scala.collection.mutable.Map[String, Long]()
      typeCountMap.foreach { x =>
        if (!typeMap.contains(x._1)) {
          typeMap(x._1) = x._2
        } else {
          val tmpCount = typeMap(x._1)
          typeMap(x._1) = x._2 + tmpCount
        }
      }
      var totalCount: Long = 0
      typeMap.foreach { x =>
        println(s"${x._1}:${x._2}")
        totalCount += x._2
      }
      println(s"total : $totalCount")
      val sortedMap = collection.mutable.LinkedHashMap(typeMap.toSeq.sortWith(_._2 > _._2): _*)

      sortedMap.foreach { x =>
        val pointType = x._1
        val count = x._2
        println(s"save type( $pointType ) count( $count )")
        val jsonRDD = xRDD.filter(x => x._1 == pointType).flatMap(x => x._2)
        // Pick a partition count proportional to the record count, bounded by maxPartition.
        var partitionNum = count / maxNumber.toLong
        if (partitionNum == 0) {
          partitionNum = 1
        }
        if (partitionNum > maxPartition.toLong) {
          partitionNum = maxPartition.toLong
        }

        println(s"\trepartition ( $partitionNum )")
        val arrType = pointType.split('-')
        sqlContext.read.json(jsonRDD).
          repartition(partitionNum.toInt).
          write.mode(SaveMode.Append).
          parquet(saveLocal.format(arrType(0), arrType(1), arrType(2), arrType(3)))
      }
      xRDD.unpersist()

      // After the batch is written, push its offsets to ZooKeeper so KafkaOffsetMonitor can see them.
      for (o <- offsetRanges) {
        val zkPath = s"${topicDirs.consumerOffsetDir}/${o.partition}"
        ZkUtils.updatePersistentPath(zkClient, zkPath, o.untilOffset.toString)
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
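
As a quick sanity check, a minimal sketch along the following lines can read the committed offsets back from ZooKeeper, i.e. the same /consumers/<group>/offsets/<topic>/<partition> nodes that KafkaOffsetMonitor polls. The ZooKeeper quorum, group name and topic name below are placeholders, and CheckCommittedOffsets is just an illustrative object name, not part of the job above:

import kafka.utils.{ZKGroupTopicDirs, ZKStringSerializer}
import org.I0Itec.zkclient.ZkClient

object CheckCommittedOffsets {
  def main(args: Array[String]): Unit = {
    // Placeholder quorum / group / topic - substitute your own values.
    val zkClient = new ZkClient("zk1:2181,zk2:2181,zk3:2181", 30000, 30000, ZKStringSerializer)
    val dirs = new ZKGroupTopicDirs("my-group", "my-topic")
    val partitions = zkClient.countChildren(dirs.consumerOffsetDir)
    (0 until partitions).foreach { p =>
      // Each child node holds the last committed offset for that partition as a string.
      val offset = zkClient.readData[String](s"${dirs.consumerOffsetDir}/$p")
      println(s"partition $p -> offset $offset")
    }
    zkClient.close()
  }
}

Because the path layout matches what the high-level consumer would write, KafkaOffsetMonitor picks up the consumer group without any extra configuration.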
Source: http://blog.itpub.net/29754888/viewspace-2125804/