Part one covered some simple tests and development of writing data to Kafka and reading it back with Spark Streaming. This part focuses on consuming data from Kafka: to keep data from being lost when the Spark Streaming application is terminated unexpectedly, the Kafka offsets need to be recorded. Here I read Kafka directly with createDirectStream, which does not go through ZooKeeper, so the consumed offsets have to be maintained by the application itself.
- Consumer code
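For reference, the plain direct approach without any offset bookkeeping looks roughly like the sketch below. The broker address and topic name are placeholders; the point is only that nothing here persists the consumed offsets, so a restart begins again from "largest" or "smallest".

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils

object PlainDirectStream {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("PlainDirectStream")
    val ssc = new StreamingContext(conf, Seconds(5))
    // Placeholder broker address; offsets live only in memory, so a crash loses the position
    val kafkaParams = Map[String, String]("metadata.broker.list" -> "localhost:9092")
    val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, Set("mytopic"))
    stream.foreachRDD(rdd => println(rdd.count()))
    ssc.start()
    ssc.awaitTermination()
  }
}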
package com.baofeng.dataparse

import org.apache.spark.{SparkConf, TaskContext}
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds
import kafka.serializer.StringDecoder
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.kafka.HasOffsetRanges
import org.apache.spark.streaming.kafka.OffsetRange
import org.apache.spark.streaming.kafka.KafkaManager
import spray.json._

object Comsumer {
  def main(args: Array[String]): Unit = {
    println("Comsumer")
    val conf = new SparkConf().setMaster("local[2]").setAppName("ReadAndSave")
    val ssc = new StreamingContext(conf, Seconds(5))
    val topics = Set("user_msg", "mytopic")
    val brokers = "192.168.201.117:9092"
    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> brokers,
      "serializer.class" -> "kafka.serializer.StringEncoder",
      "group.id" -> "group_stream_id",
      "auto.offset.reset" -> "largest")

    val km = new KafkaManager(kafkaParams)
    // Wraps createDirectStream so that the current offsets are read back first
    val kafkaStream = km.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)

    var offsetRanges = Array[OffsetRange]()
    kafkaStream.transform(rdd => {
      offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd
    }).foreachRDD { rdd =>
      for (o <- offsetRanges) {
        println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
      }
      // There are multiple topics, each with its own log format and its own
      // parsing method; the dispatch is implemented in LogParser
      rdd.foreach(r => {
        val offsetRange: OffsetRange = offsetRanges(TaskContext.get.partitionId)
        val obj = LogParser.getObject(offsetRange.topic)
        if (obj == null) {
          println(offsetRange.topic + " error, not found")
        } else {
          obj.deal(r)
        }
      })
    }

    // Write the offsets back
    km.updateZKOffsetsFromoffsetRanges(offsetRanges, 1)

    ssc.start()
    ssc.awaitTermination()
  }
}
One point that was initially confusing: the messages read from Kafka carry no topic information, so you have to recover it yourself. In the code below, offsetRanges(TaskContext.get.partitionId) is used to obtain the topic of the partition currently being processed.
kafkaStream.transform(rdd => {
  // According to the official documentation, the offsets must be captured here
  offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  rdd
}).foreachRDD { rdd =>
  for (o <- offsetRanges) {
    println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
  }
  rdd.foreach(r => {
    val offsetRange: OffsetRange = offsetRanges(TaskContext.get.partitionId)
    println(offsetRange.topic)
  })
}
The cast to HasOffsetRanges must be done in the first transformation applied to the stream; once later map operations turn the KafkaRDD into ordinary RDDs, the HasOffsetRanges information is lost.
- Business analysis code
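A minimal sketch of this pattern, assuming the same kafkaStream, offsetRanges variable, and imports as the consumer above. Looking up the OffsetRange once per partition with foreachPartition (instead of once per record) avoids repeating the lookup for every message:

kafkaStream.transform { rdd =>
  // Must run on the first RDD in the chain, while it is still a KafkaRDD
  offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  rdd
}.foreachRDD { rdd =>
  rdd.foreachPartition { iter =>
    // Partition i of the RDD corresponds to offsetRanges(i)
    val o: OffsetRange = offsetRanges(TaskContext.get.partitionId)
    println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
  }
}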
package com.baofeng.dataparse

import spray.json._

trait LogParser {
  def deal(record: Tuple2[String, String])
}

object LogParser {
  val objList: Map[String, LogParser] = Map("mytopic" -> new CmsLog, "user" -> new UserLog)

  def getObject(name: String): LogParser = {
    return objList.get(name).getOrElse(null)
  }
}

class CmsLog() extends LogParser {
  val name: String = "mytopic"

  override def deal(record: Tuple2[String, String]): Unit = {
    val r = record._2
    val data = r.split(" ")
    println(r)
  }
}

class UserLog() extends LogParser {
  val name: String = "user"

  override def deal(record: Tuple2[String, String]): Unit = {
    val data = JsonParser(record._2).asJsObject()
    println(data.getFields("userid") + " " + data.getFields("access"))
  }
}
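Supporting another topic only takes a new LogParser implementation plus an entry in objList. A hypothetical sketch, assuming the same imports as above; the topic name order_msg and its JSON fields are made up for illustration. Note that getObject matches on the exact topic name, so the key registered in objList has to be the topic string the consumer actually subscribes to.

// Hypothetical parser for an additional topic; "order_msg" and its fields are illustrative only
class OrderLog() extends LogParser {
  val name: String = "order_msg"

  override def deal(record: Tuple2[String, String]): Unit = {
    val data = JsonParser(record._2).asJsObject()
    println(data.getFields("orderid") + " " + data.getFields("amount"))
  }
}

// objList would then become:
// Map("mytopic" -> new CmsLog, "user" -> new UserLog, "order_msg" -> new OrderLog)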
- KafkaManager implementation (code adapted from the web)
package org.apache.spark.streaming.kafka

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.Decoder
import scala.reflect.ClassTag
import org.apache.spark.SparkException
import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.StreamingContext

class KafkaManager(val kafkaParams: Map[String, String]) extends Serializable {

  private val kc = new KafkaCluster(kafkaParams)
  private val flag = 1150 * 10000l

  def createDirectStream[K: ClassTag, V: ClassTag, KD <: Decoder[K]: ClassTag, VD <: Decoder[V]: ClassTag](
      ssc: StreamingContext,
      kafkaParams: Map[String, String],
      topics: Set[String]): InputDStream[(K, V)] = {
    val groupId = kafkaParams.get("group.id").get
    // Before reading offsets from ZooKeeper, update them according to the actual consumption state
    setOrUpdateOffsets(topics, groupId)
    // Read the offsets from ZooKeeper and start consuming messages from there
    val messages = {
      val partitionsE = kc.getPartitions(topics)
      if (partitionsE.isLeft)
        throw new SparkException(s"get kafka partition failed: ${partitionsE.left.get}")
      val partitions = partitionsE.right.get
      val consumerOffsetsE = kc.getConsumerOffsets(groupId, partitions)
      if (consumerOffsetsE.isLeft)
        throw new SparkException(s"get kafka consumer offsets failed: ${consumerOffsetsE.left.get}")
      val consumerOffsets = consumerOffsetsE.right.get
      KafkaUtils.createDirectStream[K, V, KD, VD, (K, V)](
        ssc, kafkaParams, consumerOffsets,
        (mmd: MessageAndMetadata[K, V]) => (mmd.key, mmd.message))
    }
    messages
  }

  /**
   * Before creating the stream, update the consumer offsets according to the actual consumption state
   * @param topics
   * @param groupId
   */
  private def setOrUpdateOffsets(topics: Set[String], groupId: String): Unit = {
    topics.foreach(topic => {
      var hasConsumed = true
      val partitionsE = kc.getPartitions(Set(topic))
      if (partitionsE.isLeft)
        throw new SparkException(s"get kafka partition failed: ${partitionsE.left.get}")
      val partitions = partitionsE.right.get
      val consumerOffsetsE = kc.getConsumerOffsets(groupId, partitions)
      if (consumerOffsetsE.isLeft) hasConsumed = false
      if (hasConsumed) {
        val earliestLeaderOffsetsE = kc.getEarliestLeaderOffsets(partitions)
        if (earliestLeaderOffsetsE.isLeft)
          throw new SparkException(s"get earliest leader offsets failed: ${earliestLeaderOffsetsE.left.get}")
        val earliestLeaderOffsets = earliestLeaderOffsetsE.right.get
        val consumerOffsets = consumerOffsetsE.right.get
        // Only some partitions' consumerOffsets may be out of range, so only those
        // partitions are reset to earliestLeaderOffsets
        var offsets: Map[TopicAndPartition, Long] = Map()
        consumerOffsets.foreach({ case (tp, n) =>
          val earliestLeaderOffset = earliestLeaderOffsets(tp).offset
          if (n < earliestLeaderOffset) {
            println("consumer group:" + groupId + ",topic:" + tp.topic + ",partition:" + tp.partition +
              " offsets are out of range, resetting to " + earliestLeaderOffset)
            offsets += (tp -> earliestLeaderOffset)
          }
        })
        if (!offsets.isEmpty) {
          kc.setConsumerOffsets(groupId, offsets)
        }
      } else {
        // This group has never consumed the topic before
        val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase)
        var leaderOffsets: Map[TopicAndPartition, LeaderOffset] = null
        if (reset == Some("smallest")) {
          val leaderOffsetsE = kc.getEarliestLeaderOffsets(partitions)
          if (leaderOffsetsE.isLeft)
            throw new SparkException(s"get earliest leader offsets failed: ${leaderOffsetsE.left.get}")
          leaderOffsets = leaderOffsetsE.right.get
        } else {
          val leaderOffsetsE = kc.getLatestLeaderOffsets(partitions)
          if (leaderOffsetsE.isLeft)
            throw new SparkException(s"get latest leader offsets failed: ${leaderOffsetsE.left.get}")
          leaderOffsets = leaderOffsetsE.right.get
        }
        val offsets = leaderOffsets.map { case (tp, offset) => (tp, offset.offset) }
        kc.setConsumerOffsets(groupId, offsets)
      }
    })
  }

  /**
   * Update the consumer offsets stored in ZooKeeper:
   * write the offsets of the batch that has just been consumed back to ZooKeeper
   *
   * @param rdd
   */
  def updateZKOffsets(rdd: RDD[(String, String)]): Unit = {
    val groupId = kafkaParams.get("group.id").get
    val offsetsList = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    for (offsets <- offsetsList) {
      val topicAndPartition = TopicAndPartition(offsets.topic, offsets.partition)
      val o = kc.setConsumerOffsets(groupId, Map((topicAndPartition, offsets.untilOffset)))
      if (o.isLeft) {
        println(s"Error updating the offset to Kafka cluster: ${o.left.get}")
      }
    }
  }

  /**
   * Update the consumer offsets stored in ZooKeeper:
   * roll the consumed offsets back by a given amount before writing them to ZooKeeper
   *
   * @param offsetRanges
   * @param day
   */
  def updateZKOffsetsFromoffsetRanges(offsetRanges: Array[OffsetRange], day: Double): Unit = {
    val groupId = kafkaParams.get("group.id").get
    for (offsets <- offsetRanges) {
      val topicAndPartition = TopicAndPartition(offsets.topic, offsets.partition)
      var offsetStreaming = 0l
      println("offsets.untilOffset " + offsets.untilOffset)
      if (offsets.untilOffset >= flag) {
        offsetStreaming = offsets.untilOffset - (flag * day).toLong
      } else {
        offsetStreaming = 0
      }
      println("offsetStreaming " + offsetStreaming)
      val o = kc.setConsumerOffsets(groupId, Map((topicAndPartition, offsetStreaming)))
      if (o.isLeft) {
        println(s"Error updating the offset to Kafka cluster: ${o.left.get}")
      }
    }
  }
}
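A typical way to use updateZKOffsets is to call it inside foreachRDD after each batch has been processed, so the offset committed to ZooKeeper always reflects finished work; if the job dies before the commit, the batch is simply re-read on restart (at-least-once semantics). A minimal sketch, assuming the kafkaStream and km from the consumer above; the processing step here is just a placeholder println:

kafkaStream.foreachRDD { rdd =>
  // Process the batch first (placeholder processing)
  rdd.foreach { case (key, value) => println(value) }
  // ... then commit the untilOffset of every partition back to ZooKeeper.
  // This only works while rdd is still the KafkaRDD, i.e. before any other transformation.
  km.updateZKOffsets(rdd)
}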