When reading data from Kafka, to avoid both re-reading from the beginning and skipping data by starting from the latest offset, the consumer offset should be recorded after each batch has been read and processed successfully. After a restart, upgrade, or maintenance, the system can then resume reading from the last successfully processed position.
1. Add dependencies
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.12</artifactId>
    <version>3.2.1</version>
    <!-- <scope>provided</scope>-->
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.12</artifactId>
    <version>3.2.1</version>
    <!-- <scope>provided</scope>-->
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-10_2.12</artifactId>
    <version>3.2.1</version>
</dependency>
<dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-core</artifactId>
    <!-- <version>2.10.1</version>-->
    <version>2.13.1</version>
</dependency>
With jackson-core 2.10.1, a java.lang.ClassNotFoundException was thrown at runtime, hence the 2.13.1 version above.
2. Add the utility class
package com.shopstatis.utils

import scala.collection.JavaConversions._
import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry
import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.OffsetRange
import scala.collection.mutable

object KafkaZkCheckPoint {
  // ZK client
  val client = {
    val client = CuratorFrameworkFactory
      .builder
      .connectString("192.168.0.181:2181")
      .retryPolicy(new ExponentialBackoffRetry(1000, 3))
      .namespace("mykafka")
      .build()
    client.start()
    client
  }

  // Root path under which offsets are stored
  val Globe_kafkaOffsetPath = "/kafka/offsets"

  // Ensure the path exists in ZK; create it (with parents) if it does not
  def ensureZKPathExists(path: String) = {
    if (client.checkExists().forPath(path) == null) {
      client.create().creatingParentsIfNeeded().forPath(path)
    }
  }

  // Store the new offsets
  def storeOffsets(offsetRange: Array[OffsetRange], groupName: String) = {
    for (o <- offsetRange) {
      val zkPath = s"${Globe_kafkaOffsetPath}/${groupName}/${o.topic}/${o.partition}"
      // First write or update of the offset for this partition
      println("---Writing offset to ZK------\nTopic:" + o.topic + ", Partition:" + o.partition + ", Offset:" + o.untilOffset)
      // Make sure the path exists
      ensureZKPathExists(zkPath)
      client.setData().forPath(zkPath, o.untilOffset.toString.getBytes())
    }
  }

  def getFromOffset(topic: Array[String], groupName: String): (Map[TopicPartition, Long], Int) = {
    // Difference between Kafka 0.8 and 0.10: 0.10 uses TopicPartition, 0.8 used TopicAndPartition
    val topic1 = topic(0).toString
    // Read the offsets saved in ZK and use them as the starting position of the DStream.
    // If the path does not exist, create it and start the DStream from the beginning.
    val zkTopicPath = s"${Globe_kafkaOffsetPath}/${groupName}/${topic1}"
    // Make sure the path exists
    ensureZKPathExists(zkTopicPath)
    // The child nodes of the topic path are the partitions
    val childrens = client.getChildren().forPath(zkTopicPath)
    // Iterate over the partitions
    val offSets: mutable.Buffer[(TopicPartition, Long)] = for {
      p <- childrens
    } yield {
      // Read the data stored in each child node, i.e. the offset
      val offsetData = client.getData().forPath(s"$zkTopicPath/$p")
      // Convert the offset to Long
      val offSet = java.lang.Long.valueOf(new String(offsetData)).toLong
      // Return (TopicPartition, Long)
      (new TopicPartition(topic1, Integer.parseInt(p)), offSet)
    }
    println(offSets.toMap)
    if (offSets.isEmpty) {
      (offSets.toMap, 0)
    } else {
      (offSets.toMap, 1)
    }
  }
}
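For a quick sanity check of the utility, a minimal standalone sketch like the following can be used. The object name, topic, group, and offset values are made up for illustration, and it assumes the ZooKeeper address configured above is reachable:

package com.shopstatis.utils

import org.apache.spark.streaming.kafka010.OffsetRange

object KafkaZkCheckPointDemo {
  def main(args: Array[String]): Unit = {
    // Pretend one batch of partition 0 of "flink-topic" was processed up to offset 100
    val ranges = Array(OffsetRange("flink-topic", 0, 0L, 100L))
    KafkaZkCheckPoint.storeOffsets(ranges, "test-consumer-group")
    // Read the offsets back: flag is 1 when saved offsets were found, 0 otherwise
    val (offsets, flag) = KafkaZkCheckPoint.getFromOffset(Array("flink-topic"), "test-consumer-group")
    println(s"flag=$flag, offsets=$offsets")
  }
}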
3. Read the saved offsets when creating the stream
package com.shopstatis.utils

import com.shopstatis.utils.KafkaZkCheckPoint.getFromOffset
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import java.util.Properties

object MyKafkaUtil {
  // 1. Load the configuration
  private val properties: Properties = PropertiesUtil.load("config.properties")
  // 2. Addresses used to connect to the cluster
  val broker_list: String = properties.getProperty("kafka.broker.list")
  val groupid: String = properties.getProperty("kafka.groupid")
  // 3. Kafka consumer configuration
  val kafkaParam = Map(
    "bootstrap.servers" -> broker_list,
    "key.deserializer" -> classOf[StringDeserializer],
    "value.deserializer" -> classOf[StringDeserializer],
    // consumer group
    "group.id" -> groupid,
    // Used when there is no initial offset, or the current offset no longer exists on any server;
    // "latest" resets the offset to the latest one
    "auto.offset.reset" -> "latest",
    // If true, the consumer's offsets are committed automatically in the background, but data can
    // be lost if Kafka goes down; if false, the Kafka offsets have to be maintained manually
    "enable.auto.commit" -> (true: java.lang.Boolean)
  )

  // Create a DStream that returns the received input data
  // LocationStrategies: create consumers for the given topics and cluster addresses
  // LocationStrategies.PreferConsistent: distribute partitions consistently across all executors
  // ConsumerStrategies: choose how Kafka consumers are created and configured on the driver and executors
  // ConsumerStrategies.Subscribe: subscribe to a collection of topics
  def getKafkaStream(topic: String, ssc: StreamingContext): InputDStream[ConsumerRecord[String, String]] = {
    val topics = Array(topic)
    val (fromOffsets, flag) = getFromOffset(topics, groupid)
    var dStream: InputDStream[ConsumerRecord[String, String]] = null
    // Offsets were found in ZK: resume from them
    if (flag == 1) {
      println("fromOffsets is:" + fromOffsets)
      dStream = KafkaUtils.createDirectStream[String, String](ssc, LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParam, fromOffsets))
      println("Streaming resumed successfully after the interruption!")
    } else {
      // No saved offsets yet: fall back to auto.offset.reset
      dStream = KafkaUtils.createDirectStream[String, String](ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParam))
    }
    dStream
  }
}
4. Record the offsets after processing completes
kafkaDStream.foreachRDD(rdd => {
  // Store the new offsets (HasOffsetRanges comes from org.apache.spark.streaming.kafka010)
  storeOffsets(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, groupid)
})
Note: the offsets must be recorded on the RDDs of the original Kafka stream, not on the RDDs of a transformed stream, because only the RDDs produced directly by the Kafka direct stream implement HasOffsetRanges. A complete driver sketch follows below.
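The following is a minimal driver sketch (not from the original project) showing how steps 3 and 4 fit together; the object name, application name, master, batch interval, and sample processing are assumptions:

package com.shopstatis.utils

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.HasOffsetRanges

object OffsetResumeApp {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("OffsetResumeApp").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(5))
    // Step 3: create the stream, resuming from the offsets saved in ZK if any exist
    val kafkaDStream = MyKafkaUtil.getKafkaStream("flink-topic", ssc)
    kafkaDStream.foreachRDD { rdd =>
      // Take the offset ranges from the original Kafka RDD before any transformation
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      // Process the batch; any transformation (map, filter, ...) happens here
      println("records in this batch: " + rdd.map(_.value()).count())
      // Step 4: persist the new offsets to ZK only after processing has succeeded
      KafkaZkCheckPoint.storeOffsets(offsetRanges, MyKafkaUtil.groupid)
    }
    ssc.start()
    ssc.awaitTermination()
  }
}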
5. Configuration file config.properties
# Kafka configuration
kafka.broker.list=192.168.0.181:9092
kafka.topic=flink-topic
kafka.groupid=test-consumer-group
6. Properties loading utility class
package com.shopstatis.utils

import java.io.InputStreamReader
import java.util.Properties

object PropertiesUtil {
  def load(propertiesName: String): Properties = {
    val prop = new Properties()
    prop.load(new InputStreamReader(
      Thread.currentThread().getContextClassLoader.getResourceAsStream(propertiesName), "UTF-8"))
    prop
  }
}
7. After the program runs, the saved offsets can be seen in ZooKeeper
[zk: localhost:2181(CONNECTED) 38] get /mykafka/kafka/offsets/test-consumer-group/flink-topic/0
2692589
By saving the message-processing offsets in ZooKeeper, the system can pick up where it left off after a restart or maintenance, preserving the continuity and consistency of processing.