1. Maintaining Kafka consumer offsets in MySQL
Add the jar dependencies:
<dependencies>
<dependency>
<groupId>net.minidev</groupId>
<artifactId>json-smart</artifactId>
<version>2.3</version>
</dependency>
<!-- Spark Streaming core -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.2.3</version>
</dependency>
<!-- Spark Streaming + Kafka integration -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>2.2.3</version>
</dependency>
<!-- Spark SQL core -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.2.3</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql-kafka-0-10_2.11</artifactId>
<version>2.2.3</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.28</version>
</dependency>
</dependencies>
<!-- Build configuration -->
<build>
<!-- Source folder -->
<sourceDirectory>src/main/scala</sourceDirectory>
<!-- Declare and apply the build plugins -->
<plugins>
<!-- Compiles the Scala sources to class files -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.2</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
<configuration>
<args>
<arg>-dependencyfile</arg>
<arg>${project.build.directory}/.scala_dependencies</arg>
</args>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<!-- Packages the application -->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.3</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<!-- Exclude these files from the shaded jar: avoids signature errors caused by duplicate dependencies -->
<filters>
<filter><artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<transformers>
<!-- Main-class entry point for the executable jar -->
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass></mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
Code implementation:
//How to commit and persist Kafka consumer offsets in MySQL
//Preparation: create an offset table in MySQL beforehand:
//CREATE TABLE `t_offset` (
//  `topic` varchar(255) NOT NULL,
//  `partition` int(11) NOT NULL,
//  `groupid` varchar(255) NOT NULL,
//  `offset` bigint(20) DEFAULT NULL,
//  PRIMARY KEY (`topic`,`partition`,`groupid`)
//) ENGINE=InnoDB DEFAULT CHARSET=utf8
import java.sql.{DriverManager, ResultSet}

import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.kafka.common.TopicPartition
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Durations, StreamingContext}

import scala.collection.mutable

object kafkaConsumer {
def main(args: Array[String]): Unit = {
// System.setProperty("hadoop.home.dir", "F:\\hadoop")
val conf: SparkConf = new SparkConf().setAppName("kafka-consumer").setMaster("local[6]")
//.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
// Set the log level
// val sc = new SparkContext()
// sc.setLogLevel("WARN")
val ssc = new StreamingContext(conf, Durations.minutes(1))
// ssc.checkpoint("check")
val param: Map[String, Object] = Map[String, Object](
ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "kafka_cdh:9092,kafka_cdh:9092,kafka_cdh:9092",
ConsumerConfig.GROUP_ID_CONFIG -> "no2",
ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "latest",
ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
// "auto.offset.reset" -> "latest",
"enable.auto.commit" -> (false: java.lang.Boolean)
)
val topics = Array("topic_name")
val offsetMap: mutable.Map[TopicPartition, Long] = OffsetUtil.getOffsetMap("no2","topic_name")
val recordDStream: InputDStream[ConsumerRecord[String, String]] = if (offsetMap.nonEmpty) {
//Offsets were recorded; resume consumption from them
KafkaUtils.createDirectStream[String, String](ssc,
LocationStrategies.PreferConsistent, //location strategy: spreads the Kafka partitions evenly across the Spark executors
ConsumerStrategies.Subscribe[String, String](topics, param, offsetMap)) //consumer strategy
} else {
//No offsets recorded in MySQL; connect directly and start from "latest"
KafkaUtils.createDirectStream[String, String](ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](topics, param))
}
//Save the offsets manually
recordDStream.foreachRDD(rdd => {
if (rdd.count() > 0) { //this batch contains data
rdd.foreach(record => println("Received Kafka record: " + record))
//e.g. Received Kafka record: ConsumerRecord(topic = spark_kafka, partition = 1, offset = 6, CreateTime = 1565400670211, checksum = 1551891492, serialized key size = -1, serialized value size = 43, key = null, value = hadoop spark ...)
val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
for (o <- offsetRanges){
println(s"topic=${o.topic},partition=${o.partition},fromOffset=${o.fromOffset},untilOffset=${o.untilOffset}")
}
//Alternatively, commit the offsets back to Kafka itself:
//recordDStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
OffsetUtil.saveOffsetRanges("no2", offsetRanges)
}
})
//Take the value of each Kafka record; this is where your own processing logic goes
val ds: DStream[String] = recordDStream.map(_.value())
ds.print()
ssc.start()
ssc.awaitTermination()
}
object OffsetUtil {
/**
* Read the offsets from the database
*/
def getOffsetMap(groupid: String, topic: String) = {
val connection = DriverManager.getConnection("jdbc:mysql://host:3306/database?characterEncoding=UTF-8", "root","root")
val pstmt = connection.prepareStatement("select * from t_offset where groupid=? and topic=?")
pstmt.setString(1, groupid)
pstmt.setString(2, topic)
val rs: ResultSet = pstmt.executeQuery()
val offsetMap = mutable.Map[TopicPartition, Long]()
while (rs.next()) {
offsetMap += new TopicPartition(rs.getString("topic"), rs.getInt("partition")) -> rs.getLong("offset")
}
if(rs != null){
rs.close()
}
if (pstmt != null){
pstmt.close()
}
if (connection != null){
connection.close()
}
offsetMap
}
/**
* Save the offsets to the database
*/
def saveOffsetRanges(groupid: String, offsetRange: Array[OffsetRange]) = {
val connection = DriverManager.getConnection("jdbc:mysql://host:3306/database?characterEncoding=UTF-8", "root","root")
//`replace into` updates the row if it already exists and inserts it otherwise
val pstmt = connection.prepareStatement("replace into t_offset (`topic`, `partition`, `groupid`, `offset`) values(?,?,?,?)")
for (o <- offsetRange) {
pstmt.setString(1, o.topic)
pstmt.setInt(2, o.partition)
pstmt.setString(3, groupid)
pstmt.setLong(4, o.untilOffset)
pstmt.executeUpdate()
}
// pstmt.close()
// connection.close()
if (pstmt != null){
pstmt.close()
}
if (connection != null){
connection.close()
}
}
}
}
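Because the listing above saves the offsets only after the batch has been processed, a batch can be reprocessed if the job dies between producing output and updating t_offset (at-least-once). If the results themselves also go to MySQL, a minimal sketch (not part of the original code) of writing the results and the offsets in a single JDBC transaction per partition follows; the t_result table is hypothetical, the connection settings and the group id "no2" are the same placeholders used above, and recordDStream and t_offset refer to the listing above:

import java.sql.DriverManager
import org.apache.spark.TaskContext
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange}

recordDStream.foreachRDD { rdd =>
  val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  rdd.foreachPartition { records =>
    // Offset range that produced this partition of the RDD
    val range = offsetRanges(TaskContext.get.partitionId())
    val conn = DriverManager.getConnection(
      "jdbc:mysql://host:3306/database?characterEncoding=UTF-8", "root", "root")
    conn.setAutoCommit(false)
    try {
      // Hypothetical result table: the records and the offset update share one transaction
      val insertResult = conn.prepareStatement("insert into t_result(`value`) values(?)")
      records.foreach { record =>
        insertResult.setString(1, record.value())
        insertResult.executeUpdate()
      }
      val saveOffset = conn.prepareStatement(
        "replace into t_offset (`topic`, `partition`, `groupid`, `offset`) values(?,?,?,?)")
      saveOffset.setString(1, range.topic)
      saveOffset.setInt(2, range.partition)
      saveOffset.setString(3, "no2")
      saveOffset.setLong(4, range.untilOffset)
      saveOffset.executeUpdate()
      conn.commit() // results and offset become visible together
    } catch {
      case e: Exception => conn.rollback(); throw e
    } finally {
      conn.close()
    }
  }
}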
2. Maintaining Kafka offsets in Redis (multiple topics, multiple partitions)
Consume the Kafka data with Spark Streaming and maintain the consumed partition offsets manually in Redis, so that data consistency is preserved across restarts. A compact sketch of the Redis layout follows; the full listing comes after it.
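As a minimal sketch (not part of the original listing; the host, port, and the object name RedisOffsetSketch are placeholders), the layout used below keeps one Redis hash per group/topic pair, with the partition number as the hash field and the next offset to read as the value:

import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.OffsetRange
import redis.clients.jedis.Jedis
import scala.collection.JavaConverters._
import scala.collection.mutable

object RedisOffsetSketch {
  // Read the hash "${groupId}_${topic}" as a map of partition -> offset
  def readOffsets(jedis: Jedis, groupId: String, topic: String): mutable.Map[TopicPartition, Long] = {
    val offsets = mutable.Map[TopicPartition, Long]()
    jedis.hgetAll(s"${groupId}_${topic}").asScala.foreach { case (partition, offset) =>
      offsets += new TopicPartition(topic, partition.toInt) -> offset.toLong
    }
    offsets
  }

  // Write the end offset of every processed range back into the same hash
  def saveOffsets(jedis: Jedis, groupId: String, ranges: Array[OffsetRange]): Unit = {
    val pipeline = jedis.pipelined()
    ranges.foreach(r => pipeline.hset(s"${groupId}_${r.topic}", r.partition.toString, r.untilOffset.toString))
    pipeline.sync()
  }
}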
// 2. Use Redis to maintain the Kafka consumer offsets
// The logic is the same as for the MySQL approach
// Add a Redis client dependency (e.g. Jedis) to the pom.xml shown earlier
package cn.ac.iie.hy.datatrans.offset
import cn.ac.iie.hy.datatrans.server.SparkStreamingKafkaOffsetRedisRecoveryNew.scala_convert
import kafka.api.PartitionOffsetRequestInfo
import kafka.common.TopicAndPartition
import kafka.javaapi.consumer.SimpleConsumer
import kafka.javaapi.{OffsetRequest, PartitionMetadata, TopicMetadataRequest, TopicMetadataResponse}
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import redis.clients.jedis.Jedis
import scala.collection.immutable.Map
import scala.collection.mutable
import scala.collection.mutable.HashMap
object Redis2SaveKafkaOffset{
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("redis2saveKafkaOffset").setMaster("local[*]")
val streamingContext = new StreamingContext(conf, Seconds(3))
streamingContext.sparkContext.setLogLevel("WARN") //set the log level
val topics = Array("topic01", "topic02")
val groups = Array("group01", "group02")
val tuples = topics.zip(groups)
//Kafka parameters
var kafkaParams = Map[String, Object](
"bootstrap.servers" -> "kafka_cdh:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"auto.offset.reset" -> "earliest",
//switch to manual offset commits
"enable.auto.commit" -> (false: java.lang.Boolean)
)
tuples.foreach(topic_group => {
val jedis = new Jedis("host", 6379)
val topic = topic_group._1
val groupId = topic_group._2
val redisKey = s"${groupId}_${topic}"
//Check whether Redis already holds offsets for this group and topic
if (jedis.exists(redisKey)) {
println(s"topic:$topic found saved offsets in Redis")
val Java_offsetMap: java.util.Map[String, String] = jedis.hgetAll(redisKey) //partition -> offset
val offsetMap: Map[String, String] = scala_convert(Java_offsetMap) //convert the java Map to a scala Map
val partitionToLong: mutable.HashMap[TopicPartition, Long] = getEffectiveOffsets(offsetMap, topic, "localhost")
println("Merged (effective) offsets:")
println(partitionToLong.toBuffer)
kafkaParams += "group.id" -> groupId
val stream = KafkaUtils.createDirectStream[String, String](
streamingContext,
//location strategy
PreferConsistent,
//subscription strategy
Subscribe[String, String](Array(topic), kafkaParams, partitionToLong)
)
stream.foreachRDD { (rdd, time) =>
//Get the offset ranges of this RDD; remember that only a Kafka RDD can be cast to HasOffsetRanges
val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
//Run an action on this RDD; the RDD operations execute on the cluster
rdd.foreach { line =>
//println(s"time:${time}==>${line.key()} ${line.value()}")
}
//Difference between foreach and foreachPartition:
//foreachPartition runs its function once per partition, whether or not the partition has data
//foreach runs its function only for records that actually exist
// rdd.foreachPartition(it =>{
// val list: List[ConsumerRecord[String, String]] = it.toList
// println(list)
// })
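// For illustration (a hedged sketch, not in the original): writing each partition through one
// connection created inside foreachPartition, since the driver-side jedis cannot be serialized
// to the executors; "host" and the "records" list key are placeholders.
// rdd.foreachPartition { it =>
//   val partitionJedis = new Jedis("host", 6379)
//   it.foreach(record => partitionJedis.rpush("records", record.value()))
//   partitionJedis.close()
// }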
//Save the offsets back to Redis
val pipeline = jedis.pipelined()
offsetRanges.foreach(eachRange => {
/**
* Redis layout
* key:   ${groupId}_${topic}
* value: Map(partition -> offset)
*/
val topic: String = eachRange.topic
val fromOffset: Long = eachRange.fromOffset
val endOffset: Long = eachRange.untilOffset
val partition: Int = eachRange.partition
val redisKey = s"${groupId}_${topic}"
pipeline.hset(redisKey,partition.toString,endOffset.toString)
println(s"time $time topic:${eachRange.topic} partition:${eachRange.partition} offset:${eachRange.untilOffset}")
})
pipeline.sync()
}
}else{
println(s"topic:$topic no offsets found in Redis")
kafkaParams += "group.id" -> groupId
val stream = KafkaUtils.createDirectStream[String, String](
streamingContext,
//location strategy
PreferConsistent,
//subscription strategy
Subscribe[String, String](Array(topic), kafkaParams)
)
stream.foreachRDD { (rdd, time) =>
//Get the offset ranges of this RDD; remember that only a Kafka RDD can be cast to HasOffsetRanges
val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
//Run an action on this RDD; the RDD operations execute on the cluster
rdd.foreach { line =>
//println(s"time:${time}==>${line.key()} ${line.value()}")
}
//Save the offsets back to Redis
val pipeline = jedis.pipelined()
offsetRanges.foreach(eachRange => {
/**
* Redis layout
* key:   ${groupId}_${topic}
* value: Map(partition -> offset)
*/
val topic: String = eachRange.topic
val fromOffset: Long = eachRange.fromOffset
val endOffset: Long = eachRange.untilOffset
val partition: Int = eachRange.partition
val redisKey = s"${groupId}_${topic}"
pipeline.hset(redisKey,partition.toString,endOffset.toString)
println(s"time $time topic:${eachRange.topic} partition:${eachRange.partition} offset:${eachRange.untilOffset}")
})
pipeline.sync()
}
}
})
streamingContext.start()
streamingContext.awaitTermination()
}
def getEffectiveOffsets(offsetMap: Map[String, String], topic: String, host: String): HashMap[TopicPartition, Long] = {
// Offsets currently stored in Redis
val redisOffsetMap = new HashMap[TopicPartition, Long]
offsetMap.foreach(partition_offset => {
val tp = new TopicPartition(topic, partition_offset._1.toInt)
redisOffsetMap += tp -> partition_offset._2.toLong
})
println(s"---------- Offsets maintained in Redis for topic $topic ----------")
println(redisOffsetMap.toBuffer)
//********** Handles the case where the streaming job has been stopped for a long time and the
//recorded offsets have already been removed from Kafka, which would otherwise make the restart fail
import scala.collection.mutable.Map
//Earliest offset currently available in the Kafka cluster, per partition
val clusterEarliestOffsets = Map[Long, Long]()
val consumer: SimpleConsumer = new SimpleConsumer(host, 9092, 100000, 64 * 1024,
"leaderLookup" + System.currentTimeMillis())
//Use implicit conversions between Java and Scala collection types
import scala.collection.convert.wrapAll._
val request: TopicMetadataRequest = new TopicMetadataRequest(List(topic))
val response: TopicMetadataResponse = consumer.send(request)
consumer.close()
//<topic1_Metadata(p1,p2), topic2_Metadata(p1)> => <topic1_Metadata_p1, topic1_Metadata_p2, topic2_Metadata_p1>
val metadatas: mutable.Buffer[PartitionMetadata] = response.topicsMetadata.flatMap(f => f.partitionsMetadata)
//Query the Kafka cluster for the current earliest offset of each partition
metadatas.map(f => {
val partitionId: Int = f.partitionId
val leaderHost: String = f.leader.host
val leaderPort: Int = f.leader.port
val clientName: String = "Client_" + topic + "_" + partitionId
val consumer: SimpleConsumer = new SimpleConsumer(leaderHost, leaderPort, 100000,
64 * 1024, clientName)
val topicAndPartition = new TopicAndPartition(topic, partitionId)
val requestInfo = new HashMap[TopicAndPartition, PartitionOffsetRequestInfo]();
//kafka.api.OffsetRequest.LatestTime
requestInfo.put(topicAndPartition, new PartitionOffsetRequestInfo(kafka.api.OffsetRequest.EarliestTime, 1));
val request = new OffsetRequest(requestInfo, kafka.api.OffsetRequest.CurrentVersion, clientName)
val response = consumer.getOffsetsBefore(request)
val offsets: Array[Long] = response.offsets(topic, partitionId)
consumer.close()
clusterEarliestOffsets += ((partitionId, offsets(0)))
})
println(s"------- Earliest offsets for topic $topic -------")
println(clusterEarliestOffsets)
//Outer loop over the earliest offsets reported by Kafka
for ((clusterPartition, clusterEarliestOffset) <- clusterEarliestOffsets) {
val tp = new TopicPartition(topic, clusterPartition.toInt)
val option: Option[Long] = redisOffsetMap.get(tp)
// Partition exists in Kafka but not in Redis: Kafka has added a new partition
if (option.isEmpty) { //use the earliest offset
println(s"====>topic:$topic new partition added: $tp")
redisOffsetMap += (tp -> clusterEarliestOffset)
} else {
var redisOffset: Long = option.get
if (redisOffset < clusterEarliestOffset) { //the offset stored in Redis is older than the earliest offset still available; fall back to the earliest offset
redisOffset = clusterEarliestOffset
redisOffsetMap += (tp -> redisOffset)
}
}
}
redisOffsetMap
}
}
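getEffectiveOffsets above uses the legacy SimpleConsumer API to look up the earliest retained offset of each partition. As a hedged alternative sketch (not part of the original code; it assumes a kafka-clients version of at least 0.10.1 on the classpath, and the bootstrap address and object name are placeholders), the same lookup can be done with the newer consumer API:

import java.util.Properties
import org.apache.kafka.clients.consumer.KafkaConsumer
import org.apache.kafka.common.TopicPartition
import scala.collection.JavaConverters._

object EarliestOffsetSketch {
  // Returns partition -> earliest retained offset for the given topic partitions
  def earliestOffsets(bootstrapServers: String, topic: String, partitions: Seq[Int]): Map[Int, Long] = {
    val props = new Properties()
    props.put("bootstrap.servers", bootstrapServers)
    props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    val consumer = new KafkaConsumer[String, String](props)
    try {
      val tps = partitions.map(p => new TopicPartition(topic, p)).asJava
      consumer.beginningOffsets(tps).asScala.map { case (tp, off) => tp.partition() -> off.longValue() }.toMap
    } finally {
      consumer.close()
    }
  }
}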