package bi
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.spark.streaming.kafka.{HasOffsetRanges, OffsetRange}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import pool.CreateRedisPoolTest
import scala.collection.JavaConverters._
import scala.util.Try
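
// A Spark Streaming job using the Kafka 0.8 direct API that manages consumer
// offsets itself: offsets are read from Redis on startup and written back to
// Redis after each processed batch, instead of relying on Kafka auto-commit.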
object PileStateTest {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.INFO)
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("streaming")
    val spark = SparkSession.builder().config(sparkConf).getOrCreate()
    val sc = spark.sparkContext
    val ssc = new StreamingContext(sc, Seconds(10))
    val topic = Array("saas-pile-state")
    val groupId = "streaming_test"
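    // Offsets are committed manually to Redis, so Kafka auto-commit is disabled;
    // "largest" is the 0.8-era value for auto.offset.reset (the equivalent of
    // "latest" in the new consumer API).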
    val kafkaParams = Map[String, String](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "172.16.1.187:9092",
      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "largest",
      ConsumerConfig.GROUP_ID_CONFIG -> groupId,
      ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> "false"
    )
    // Build the input DStream, resuming from Redis offsets when available
    val lines = createStreamingContextRedis(ssc, topic, kafkaParams)
    lines.foreachRDD(rdd => {
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      println(rdd.count())
      // saveAsTextFile writes a directory and fails if the path already exists,
      // so use a batch-unique path instead of a fixed file name
      rdd.saveAsTextFile(s"C:\\Users\\nc\\Desktop\\py\\batch_${System.currentTimeMillis()}")
      rdd.foreach(x => {
        println(x._1 + "\t" + x._2)
      })
      // Commit offsets to Redis only after the batch has been processed,
      // giving at-least-once semantics
      storeOffset(offsetRanges, groupId)
    })
    ssc.start()
    ssc.awaitTermination()
  }
  // Create the DStream, resuming from offsets stored in Redis when they exist
  def createStreamingContextRedis(ssc: StreamingContext, topic: Array[String],
                                  kafkaParams: Map[String, String]): InputDStream[(String, String)] = {
    var resDS: InputDStream[(String, String)] = null
    val groupId = kafkaParams(ConsumerConfig.GROUP_ID_CONFIG)
    val (fromOffSet, flag) = getOffset(topic, groupId)
    val offsetReset = kafkaParams(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG)
    if (flag == 1 && offsetReset.equals("largest")) {
      // Return (key, message) pairs so both branches produce the same tuple shape
      val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key(), mmd.message())
      println("Creating DStream from offsets stored in Redis")
      resDS = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffSet, messageHandler)
    } else {
      println("Creating DStream without stored offsets")
      resDS = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topic.toSet)
    }
    resDS
  }
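
  // Note: this is the spark-streaming-kafka 0.8 direct API (StringDecoder,
  // MessageAndMetadata, TopicAndPartition); the 0.10 integration replaces it
  // with ConsumerStrategies.Subscribe and LocationStrategies.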
  // Fetch the stored offset for each topic/partition from Redis
  def getOffset(topics: Array[String], groupId: String): (Map[TopicAndPartition, Long], Int) = {
    val fromOffSets = scala.collection.mutable.Map[TopicAndPartition, Long]()
    val redisPool = CreateRedisPoolTest()
    val jedis = redisPool.borrowObject()
    topics.foreach(topic => {
      val keys = jedis.keys(s"bi_kafka_offset_${groupId}_${topic}*")
      if (!keys.isEmpty) {
        keys.asScala.foreach(key => {
          val offset = jedis.get(key)
          // The partition id is the suffix after the key prefix; default to 0 if parsing fails
          val partition = Try(key.split(s"bi_kafka_offset_${groupId}_${topic}_").apply(1)).getOrElse("0")
          val tp = TopicAndPartition(topic, partition.toInt)
          fromOffSets.put(tp, offset.toLong)
        })
      }
    })
    redisPool.returnObject(jedis)
    // Flag 1 means offsets were found in Redis; 0 means first run (nothing stored yet)
    if (fromOffSets.isEmpty) {
      (fromOffSets.toMap, 0)
    } else {
      (fromOffSets.toMap, 1)
    }
  }
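
  // Example of the resulting key layout in Redis:
  //   bi_kafka_offset_streaming_test_saas-pile-state_0 -> "12345"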
  // Save the end offset of each processed range back to Redis
  def storeOffset(ranges: Array[OffsetRange], groupId: String): Unit = {
    val redisPool = CreateRedisPoolTest()
    val jedis = redisPool.borrowObject()
    for (o <- ranges) {
      val key = s"bi_kafka_offset_${groupId}_${o.topic}_${o.partition}"
      val value = o.untilOffset
      jedis.set(key, value.toString)
    }
    redisPool.returnObject(jedis)
  }
}
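
The pool helper pool.CreateRedisPoolTest is imported above but not shown. Since the job calls borrowObject()/returnObject(), it is presumably backed by an Apache commons-pool2 GenericObjectPool. Below is a minimal sketch of what such a helper might look like, assuming Jedis and commons-pool2; the Redis host, port, and class names here are placeholders, not the original implementation:

// pool/CreateRedisPoolTest.scala -- hypothetical reconstruction, not the original helper
package pool

import org.apache.commons.pool2.{BasePooledObjectFactory, PooledObject}
import org.apache.commons.pool2.impl.{DefaultPooledObject, GenericObjectPool}
import redis.clients.jedis.Jedis

// Factory that tells the pool how to create, wrap and destroy Jedis connections
class JedisFactory(host: String, port: Int) extends BasePooledObjectFactory[Jedis] {
  override def create(): Jedis = new Jedis(host, port)
  override def wrap(jedis: Jedis): PooledObject[Jedis] = new DefaultPooledObject[Jedis](jedis)
  override def destroyObject(p: PooledObject[Jedis]): Unit = p.getObject.close()
}

object CreateRedisPoolTest {
  // One shared, lazily created pool per JVM (i.e. per Spark executor);
  // "127.0.0.1" and 6379 are placeholder connection settings
  private lazy val pool = new GenericObjectPool[Jedis](new JedisFactory("127.0.0.1", 6379))

  // apply() lets callers write CreateRedisPoolTest() as in the job above
  def apply(): GenericObjectPool[Jedis] = pool
}

With a helper along these lines, borrowObject() hands out a pooled Jedis connection and returnObject() gives it back, which is what getOffset and storeOffset rely on.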
Saving Kafka offsets to Redis with Spark Streaming's direct approach