Two implementations are shown below: one stores the consumer offsets back through Kafka/ZooKeeper (via KafkaCluster), the other stores them in MySQL.
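The code targets the legacy low-level direct consumer from the spark-streaming-kafka-0-8 connector. As a minimal sketch, the sbt dependencies could look like this (the version numbers are assumptions, not taken from the original; adjust them to your cluster):

// build.sbt (sketch; versions are placeholders)
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-streaming"           % "2.1.1",
  "org.apache.spark" %% "spark-streaming-kafka-0-8" % "2.1.1",
  "com.mchange"      %  "c3p0"                      % "0.9.5.2",
  "mysql"            %  "mysql-connector-java"      % "5.1.47"
)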
import java.sql.{Connection, PreparedStatement, ResultSet}

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaCluster.Err
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaCluster, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
// C3p0Utils is a project-local helper; a sketch of it is given at the end of this post

/**
 * @author wade
 * @create 2019-03-14 9:16
 */
object LowerKafkaSource {
  def fromOffsets(kafkaCluster: KafkaCluster, groupId: String, topic: String): Map[TopicAndPartition, Long] = {
    var partitionToLong: Map[TopicAndPartition, Long] = Map[TopicAndPartition, Long]()
    // Fetch all partitions of the topic from the cluster
    val topicAndPartitionEither: Either[Err, Set[TopicAndPartition]] = kafkaCluster.getPartitions(Set(topic))
    if (topicAndPartitionEither.isRight) {
      val partitions: Set[TopicAndPartition] = topicAndPartitionEither.right.get
      // Fetch the offsets this consumer group has committed for those partitions
      val topicAndPartitionToLongEither: Either[Err, Map[TopicAndPartition, Long]] = kafkaCluster.getConsumerOffsets(groupId, partitions)
      // Also fetch the earliest offset still available on each partition's leader
      // (.right.get assumes the leaders are reachable; it throws otherwise)
      val earliest: Either[Err, Map[TopicAndPartition, KafkaCluster.LeaderOffset]] = kafkaCluster.getEarliestLeaderOffsets(partitions)
      val earliestTopicAndPartitionToLong: Map[TopicAndPartition, Long] = earliest.right.get.map(t => (t._1, t._2.offset))
      if (topicAndPartitionToLongEither.isRight) {
        val topicAndPartitionToLong: Map[TopicAndPartition, Long] = topicAndPartitionToLongEither.right.get
        // The group has committed offsets before. A committed offset can still be
        // smaller than the partition's earliest available offset (Kafka may have
        // deleted expired segments since the last commit), so merge the two maps
        // and take the larger value per partition.
        println("topicAndPartitionToLong" + topicAndPartitionToLong)
        println("earliestTopicAndPartitionToLong" + earliestTopicAndPartitionToLong)
        var partitionToLongFinal: Map[TopicAndPartition, Long] = Map()
        for (current <- topicAndPartitionToLong) {
          for (earliest <- earliestTopicAndPartitionToLong) {
            if (current._1 == earliest._1) {
              partitionToLongFinal += (current._1 -> math.max(earliest._2, current._2))
            }
          }
        }
        println("current offsets: " + partitionToLongFinal)
        println("earliest offsets: " + earliestTopicAndPartitionToLong)
        // ++= builds a new map and reassigns it to the var
        partitionToLong ++= partitionToLongFinal
      } else {
        // No committed offsets for this group: seed from the earliest available offsets
        // (seeding from the latest offsets would also be valid, depending on requirements)
        println(earliestTopicAndPartitionToLong)
        println("no committed offsets; seeding from the earliest available offsets")
        partitionToLong ++= earliestTopicAndPartitionToLong
        // Do NOT seed with 0L here, e.g.
        //   for (topicAndPartition <- partitions) partitionToLong += (topicAndPartition -> 0L)
        // A group that has never committed must start from the current earliest offset:
        // data expires over time, so some offsets after 0 may already be deleted, and
        // starting at 0L would raise an OffsetOutOfRangeException.
      }
    } else {
      throw new RuntimeException("partitions of topic not found")
    }
    partitionToLong
  }
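  // Note: for a group that has never committed anything, the offset fetch typically
  // comes back as Left rather than Right, which is what routes execution into the
  // else branch above and seeds the group from the earliest offsets.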
  def setOffset(kafkaCluster: KafkaCluster, groupId: String, kafkaDirectStream: InputDStream[String]) = {
    kafkaDirectStream.foreachRDD(rdd => {
      // All of this must stay INSIDE foreachRDD: the offset ranges are per batch
      var offsets: Map[TopicAndPartition, Long] = Map()
      val ranges: HasOffsetRanges = rdd.asInstanceOf[HasOffsetRanges]
      val offsetRanges: Array[OffsetRange] = ranges.offsetRanges
      for (elem <- offsetRanges) {
        offsets += (elem.topicAndPartition() -> elem.untilOffset)
      }
      kafkaCluster.setConsumerOffsets(groupId, offsets)
    })
  }
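  // Design note: output operations run in registration order per batch, so the commit
  // in this foreachRDD happens after the print() registered earlier in main. That makes
  // the pipeline at-least-once: if the job dies between output and commit, the batch is
  // reprocessed on restart. Exactly-once would additionally need an idempotent or
  // transactional sink, which is outside the scope of this example.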
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("lower")
    val ssc = new StreamingContext(conf, Seconds(3))
    val topic = "my"
    val groupId = "big10151"
    val brokers = "hadoop103:9092,hadoop104:9092,hadoop105:9092"
    val deserializer = "org.apache.kafka.common.serialization.StringDeserializer"
    val kafkaParams = Map(
      //"zookeeper.connect" -> "hadoop103:2181,hadoop104:2181,hadoop105:2181",
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokers,
      ConsumerConfig.GROUP_ID_CONFIG -> groupId,
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> deserializer,
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> deserializer
    )
    val kafkaCluster = new KafkaCluster(kafkaParams)
    // The fifth type parameter is the return type of the message handler passed last: message.message()
    val kafkaDirectStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, String](
      ssc,
      kafkaParams,
      //fromOffsets(kafkaCluster, groupId, topic),
      fromOffsetsMysql(kafkaCluster, groupId, topic),
      // The handler decides what each record becomes; it can return the key, the value, or both
      (message: MessageAndMetadata[String, String]) => message.message()
    )
    kafkaDirectStream.print()
    // Commit the latest offsets after each batch
    //setOffset(kafkaCluster, groupId, kafkaDirectStream)
    setOffsetMysql(kafkaCluster, groupId, kafkaDirectStream)
    ssc.start()
    ssc.awaitTermination()
  }
  def fromOffsetsMysql(kafkaCluster: KafkaCluster, groupId: String, topic: String): Map[TopicAndPartition, Long] = {
    // 1. Look up all partitions of the topic through KafkaCluster
    val topicAndPartitionsEither: Either[Err, Set[TopicAndPartition]] = kafkaCluster.getPartitions(Set(topic))
    var topicAndPartitionsToLong = Map[TopicAndPartition, Long]()
    if (topicAndPartitionsEither.isRight) {
      // 2. The MySQL primary key is topic + group + partition,
      //    stored in table tbl_offset as (topic_group_partition, offset_info)
      val topicAndPartitions: Set[TopicAndPartition] = topicAndPartitionsEither.right.get
      val offsetKeys = topicAndPartitions.map(tap => (tap.topic + "_" + groupId + "_" + tap.partition, tap))
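      // Sketch of the assumed table definition (the original does not show it; offsets
      // are Long, so offset_info should be BIGINT rather than INT):
      //   CREATE TABLE tbl_offset (
      //     topic_group_partition VARCHAR(200) PRIMARY KEY,
      //     offset_info           BIGINT NOT NULL
      //   );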
      /*
        First fetch the earliest available offset of every partition from the Kafka cluster:
        1. if MySQL holds no offset for a partition, default to this earliest offset;
        2. if the offset read from MySQL is smaller than it (the data has expired),
           use the earliest one instead, otherwise an exception would be thrown.
      */
      val topicAndPartitionEither: Either[Err, Map[TopicAndPartition, KafkaCluster.LeaderOffset]] = kafkaCluster.getEarliestLeaderOffsets(topicAndPartitions)
      val earliestTopicAndPartitions: Map[TopicAndPartition, Long] = topicAndPartitionEither.right.get.map(t => (t._1, t._2.offset))
      val conn: Connection = C3p0Utils.getConnection
      offsetKeys.foreach(key => {
        // Look up the committed offset for this topic_group_partition key
        val sql = "SELECT offset_info FROM tbl_offset WHERE topic_group_partition = ?"
        val ps: PreparedStatement = conn.prepareStatement(sql)
        ps.setString(1, key._1)
        val rs: ResultSet = ps.executeQuery()
        if (rs.next()) {
          // Offsets are Long; read them as Long to avoid truncation
          val offset: Long = rs.getLong(1)
          // Compare the offset from MySQL with the earliest available offset
          val earliestOffset = earliestTopicAndPartitions(key._2)
          if (offset < earliestOffset) {
            topicAndPartitionsToLong += (key._2 -> earliestOffset)
          } else {
            topicAndPartitionsToLong += (key._2 -> offset)
          }
        } else {
          // Nothing stored yet: fall back to the earliest offset
          topicAndPartitionsToLong += (key._2 -> earliestTopicAndPartitions(key._2))
        }
        rs.close()
        ps.close()
      })
      conn.close()
    }
    topicAndPartitionsToLong
  }
  def setOffsetMysql(kafkaCluster: KafkaCluster, groupId: String, kafkaDirectStream: InputDStream[String]) = {
    kafkaDirectStream.foreachRDD(rdd => {
      val offsetRanges: HasOffsetRanges = rdd.asInstanceOf[HasOffsetRanges]
      val ranges: Array[OffsetRange] = offsetRanges.offsetRanges
      val conn: Connection = C3p0Utils.getConnection
      for (range <- ranges) {
        val tap: TopicAndPartition = range.topicAndPartition()
        val key = tap.topic + "_" + groupId + "_" + tap.partition
        // Upsert: insert the key on first sight, otherwise overwrite the stored offset
        val sql = "INSERT INTO tbl_offset VALUES (?,?) ON DUPLICATE KEY UPDATE offset_info = ?"
        val ps: PreparedStatement = conn.prepareStatement(sql)
        ps.setString(1, key)
        // untilOffset is a Long; do not truncate it to Int
        ps.setLong(2, range.untilOffset)
        ps.setLong(3, range.untilOffset)
        ps.executeUpdate()
        ps.close()
      }
      conn.close()
    })
  }
}
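C3p0Utils is referenced above but never shown. A minimal sketch of what such a helper could look like, assuming a c3p0 ComboPooledDataSource in front of the MySQL instance used here (the JDBC URL, database name, user, and password are placeholders, not taken from the original):

import java.sql.Connection
import com.mchange.v2.c3p0.ComboPooledDataSource

object C3p0Utils {
  // The pool is created once per JVM; c3p0 also picks up settings from
  // c3p0-config.xml on the classpath if one is present
  private val dataSource = new ComboPooledDataSource()
  dataSource.setDriverClass("com.mysql.jdbc.Driver")
  dataSource.setJdbcUrl("jdbc:mysql://hadoop103:3306/streaming") // placeholder URL
  dataSource.setUser("root")                                     // placeholder user
  dataSource.setPassword("123456")                               // placeholder password

  def getConnection: Connection = dataSource.getConnection
}

Callers such as fromOffsetsMysql and setOffsetMysql borrow a Connection from the pool and close() it when done, which returns it to the pool rather than tearing down the TCP connection each batch.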