Goal
Store the Kafka offsets in an external Redis database, and on the next run read the starting offsets back from that same Redis database.
Main steps
1. Obtain the starting offsets of the messages to be read from Kafka (here, stored in Redis).
2. Read the data from Kafka starting at those offsets and process it.
3. Write the newest offsets that were read back to Redis.
Demo
First, start a producer:
kafka-console-producer.sh \
--broker-list mypc01:9092,mypc02:9092,mypc03:9092 \
--topic pet
Below is the content of consumer.properties; the consumer parameters are read from this file. Since the offsets are committed manually to Redis, automatic offset commits are disabled (enable.auto.commit=false).
bootstrap.servers=mypc01:9092,mypc02:9092,mypc03:9092
group.id=test1
enable.auto.commit=false
key.deserializer=org.apache.kafka.common.serialization.StringDeserializer
value.deserializer=org.apache.kafka.common.serialization.StringDeserializer
zookeeper.servers=mypc01:2181,mypc02:2181,mypc03:2181
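Note that the zookeeper.servers entry is informational only: the Kafka 0.10 direct API used below talks to the brokers directly and never reads that key. As a sketch of an alternative to copying the keys one by one (which is what the full example does), the whole file can be turned into the parameter map that ConsumerStrategies.Subscribe accepts; the object name KafkaParamsFromProperties is just an illustrative placeholder:

import java.util.Properties
import scala.collection.JavaConverters._

object KafkaParamsFromProperties {
  def load(): Map[String, Object] = {
    val properties = new Properties()
    properties.load(getClass.getClassLoader.getResourceAsStream("consumer.properties"))
    // keep every consumer setting except zookeeper.servers, which the direct API ignores
    properties.asScala.toMap.filterKeys(_ != "zookeeper.servers").toMap
  }
}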
Example code
import org.apache.commons.pool2.impl.GenericObjectPoolConfig
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import redis.clients.jedis.{Jedis, JedisPool}
import java.util
import java.util.Properties
/*
Maintain the offsets of the Kafka topic partitions in the Redis key-value store.
1. Obtain the starting offsets of the messages to be read from Kafka (stored in Redis).
2. Read the data from Kafka starting at those offsets and process it.
3. Write the newest offsets that were read back to Redis.

Layout of the offset data in Redis: a hash type works well.
  key (group.id)   field (topic#partition)   value (offset)
  test1            pet#0                     12
                   pet#1                     10
                   pet#2                     12
*/
object RedisOffsetDemo extends App {
private val conf = new SparkConf().setAppName("test").setMaster("local[*]")
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
private val ssc: StreamingContext = new StreamingContext(conf, Seconds(5))
private val properties = new Properties()
// load the consumer configuration file
properties.load(RedisOffsetDemo.getClass.getClassLoader.getResourceAsStream("consumer.properties"))
// copy the consumer configuration parameters into a Map
private val paras: Map[String, String] = Map[String, String](
"bootstrap.servers" -> properties.getProperty("bootstrap.servers"),
"group.id" -> properties.getProperty("group.id"),
"enable.auto.commit" -> properties.getProperty("enable.auto.commit"),
"key.deserializer" -> properties.getProperty("key.deserializer"),
"value.deserializer" -> properties.getProperty("value.deserializer")
)
// the topics to subscribe to
val topics = Array("pet")
private val redisUtils = new RedisUtils()
// get a Jedis connection
private val jedis: Jedis = redisUtils.getJedis
// read the offsets stored in Redis; the result is a Map keyed by TopicPartition
private val offsets: Map[TopicPartition, Long] = redisUtils.getOffset(jedis, properties)
var dstream: InputDStream[ConsumerRecord[String, String]] = _
// if there are stored offsets, start consuming from them
if (offsets.nonEmpty) {
// consume from Kafka; the consumed records form a DStream that can then be processed with the usual operators
// the third argument of createDirectStream is a consumer strategy, and Subscribe can additionally take a map of starting offsets
// since the offsets are managed manually, the stored offsets are passed in here
dstream = KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](topics, paras, offsets))
} else {
// if there are no stored offsets, let the consumer start from its default position (controlled by auto.offset.reset)
dstream = KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](topics, paras))
}
// process the consumed data
// each element of the DStream obtained from Kafka is a ConsumerRecord, i.e. one message
// various pieces of information can be extracted from each record
dstream.foreachRDD((rdd: RDD[ConsumerRecord[String, String]]) => {
// the offset ranges must be read on the driver, before anything else is done with the RDD
val ranges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
// process the records of this batch
rdd.foreach((x: ConsumerRecord[String, String]) => {
println(s"partition: ${x.partition()} offset: ${x.offset()} value: ${x.value()}")
})
// write the newest offsets of this batch back to Redis, once per batch and on the driver;
// since this happens after processing, a failure in between means the batch is re-read (at-least-once)
redisUtils.updateOffsets(properties.getProperty("group.id"), ranges, jedis)
})
ssc.start()
ssc.awaitTermination()
}
class RedisUtils {
// helper that returns a Jedis connection
def getJedis: Jedis = {
val config = new GenericObjectPoolConfig()
// maximum number of connections in the pool
config.setMaxTotal(15)
// maximum number of idle connections
config.setMaxIdle(10)
// minimum number of idle connections
config.setMinIdle(5)
// create the connection pool (in a real application the pool would normally be created once and shared)
val pool = new JedisPool(config, "mypc01", 6379)
// borrow a connection from the pool
val jedis: Jedis = pool.getResource
jedis
}
// helper that reads the offsets stored in Redis
def getOffset(jedis: Jedis, prop: Properties): Map[TopicPartition, Long] = {
// start from an empty offsets map
var offsets: Map[TopicPartition, Long] = Map()
// use the group name as the Redis key and fetch all of its fields and values
// in this example, the fields and values stored under key "test1" are returned as a map
// i.e. read the whole Redis hash stored under that key
// each field is topic#partition and each value is the offset,
// e.g. Map(pet#0 -> 11, pet#1 -> 18)
val kvs: util.Map[String, String] = jedis.hgetAll(prop.getProperty("group.id"))
import scala.collection.JavaConverters._
for (kv <- kvs.asScala) {
val arr: Array[String] = kv._1.split("#")
// parse the topic from the field
val topic: String = arr(0)
// parse the partition from the field
val partition: Int = arr(1).toInt
// add this partition's offset to the map
offsets += (new TopicPartition(topic, partition) -> kv._2.toLong)
}
offsets
}
// write the latest offset of each topic#partition into the Redis hash keyed by the group name
def updateOffsets(groupName: String, range: Array[OffsetRange], jedis: Jedis): Unit = {
for (x <- range) {
jedis.hset(groupName, x.topic + "#" + x.partition, x.untilOffset.toString)
}
}
}
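A minimal, hypothetical round-trip check of the two helper methods (the object name RedisUtilsCheck and the sample offset values are made up; it assumes Redis is reachable on mypc01:6379, as in getJedis above):

import java.util.Properties
import org.apache.spark.streaming.kafka010.OffsetRange

object RedisUtilsCheck extends App {
  val props = new Properties()
  props.setProperty("group.id", "test1")

  val utils = new RedisUtils()
  val jedis = utils.getJedis

  // pretend one batch read topic "pet", partition 0, up to offset 12
  val ranges = Array(OffsetRange("pet", 0, 0L, 12L))
  utils.updateOffsets("test1", ranges, jedis)

  // the next run would resume from the stored value, e.g. Map(pet-0 -> 12)
  println(utils.getOffset(jedis, props))
}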
API notes
def createDirectStream[K, V](ssc: StreamingContext, locationStrategy: LocationStrategy, consumerStrategy: ConsumerStrategy[K, V]): InputDStream[ConsumerRecord[K, V]]
Scala constructor for a DStream in which each given Kafka topic/partition corresponds to one RDD partition. The Spark configuration spark.streaming.kafka.maxRatePerPartition gives the maximum number of messages per second that each partition will accept.
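For example, the rate limit can be set on the SparkConf before the StreamingContext is created; the value 100 below is an arbitrary illustrative number:

import org.apache.spark.SparkConf

// cap each Kafka partition at roughly 100 records per second
val conf = new SparkConf()
  .setAppName("test")
  .setMaster("local[*]")
  .set("spark.streaming.kafka.maxRatePerPartition", "100")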
org.apache.spark.streaming.kafka010
trait HasOffsetRanges
Represents any object that has a collection of OffsetRanges. This can be used to access the offset ranges in RDDs generated by the direct Kafka DStream (see KafkaUtils.createDirectStream), for example:
KafkaUtils.createDirectStream(...).foreachRDD { rdd =>
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  ...
}
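Each OffsetRange in that array describes one topic partition and the half-open interval of offsets read in the batch, which is exactly what updateOffsets above writes to Redis. Continuing the snippet:

// what each OffsetRange exposes
offsetRanges.foreach { o =>
  println(s"topic=${o.topic} partition=${o.partition} from=${o.fromOffset} until=${o.untilOffset}")
}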