1. Import the dependencies
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>4.1.0</version>
</dependency>
<!-- spark streaming -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.12</artifactId>
<version>3.1.2</version>
</dependency>
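Note: the KafkaUtils, LocationStrategies and ConsumerStrategies classes used in sections 3 and 4 come from Spark's separate Kafka 0.10 integration module, so it must be on the classpath as well; the version below simply mirrors the spark-streaming dependency above.
<!-- spark streaming kafka 0.10 integration (KafkaUtils, ConsumerStrategies, ...) -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.12</artifactId>
<version>3.1.2</version>
</dependency>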
2. The JedisUtils utility class
import java.util
import org.apache.commons.pool2.impl.GenericObjectPoolConfig
import org.apache.kafka.common.TopicPartition
import redis.clients.jedis.{Jedis, JedisPool}
import scala.collection.{JavaConverters, mutable}
object JedisUtils {
/**
* Fetch the offsets stored in Redis for the given topics
*/
def getFromOffsets(topics: Set[String]): Map[TopicPartition, Long] = {
//1. create a mutable map to hold the final result
val map: mutable.Map[TopicPartition, Long] = mutable.Map[TopicPartition, Long]()
//2. obtain a Jedis connection
val jedis: Jedis = JedisUtils.getDefaultJedis
//3. for each topic, read the hash stored under the topic key; each hash value has the form "partition:offset|partition:offset|..."
topics.foreach(topic => {
val gpo = JavaConverters.mapAsScalaMap(jedis.hgetAll(topic))
for ((gp:String, offset:String) <- gpo) {
val fields: Array[String] = offset.split("\\|")
fields.foreach(field => {
val po: Array[String] = field.split(":")
map.put(new TopicPartition(topic, po(0).toInt), po(1).toLong)
})
}
})
//4. return the connection to the pool and hand back an immutable copy of the result
JedisUtils.close(jedis, isDefault = true)
map.toMap
}
private val DEFAULT_HOST = "x.x.x.x" // replace with your own Redis host IP
private val DEFAULT_PORT = 6379
private val config = new GenericObjectPoolConfig[Jedis]
config.setMaxTotal(100) // maximum number of connections
config.setMaxIdle(50) // maximum number of idle connections
config.setMinIdle(10) // minimum number of idle connections
private val defaultPool = new JedisPool(config, DEFAULT_HOST, DEFAULT_PORT)
private var pool:JedisPool = null
def getDefaultJedis: Jedis = defaultPool.getResource
def initPool(host: String, port: Int): Unit = pool = new JedisPool(config, host, port)
/**
* initPool must have been called before this method is used
*/
def getJedis: Jedis = if (pool != null) pool.getResource
else null
// return a connection to its pool
// Jedis 4.x removed JedisPool.returnResource; closing a pooled Jedis instance returns it to its pool.
// isDefault is kept only so existing call sites keep compiling.
def close(jedis: Jedis, isDefault: Boolean): Unit = if (jedis != null) jedis.close()
}
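The object above only covers the read side. Its counterpart, a method that writes each batch's end offsets back to Redis, is not shown here; a minimal sketch that matches the layout getFromOffsets expects (hash key = topic, hash field = consumer group id, hash value = "partition:offset|partition:offset|...") could look like the following. storeOffsets is a hypothetical helper, not part of the original code; it would live inside the JedisUtils object and needs an extra import of OffsetRange.
import org.apache.spark.streaming.kafka010.OffsetRange

// Hypothetical write-side counterpart of getFromOffsets (add inside the JedisUtils object).
// Stores, per topic, a hash field <groupId> whose value is "partition:untilOffset|..." so that
// getFromOffsets can parse it back the next time the application starts.
def storeOffsets(groupId: String, offsetRanges: Array[OffsetRange]): Unit = {
  val jedis: Jedis = getDefaultJedis
  try {
    offsetRanges.groupBy(_.topic).foreach { case (topic, ranges) =>
      val value: String = ranges.map(r => s"${r.partition}:${r.untilOffset}").mkString("|")
      jedis.hset(topic, groupId, value)
    }
  } finally {
    close(jedis, isDefault = true)
  }
}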
3. The SparkUtils utility class
import com.qf.bigdata.spark.streaming.day2.JedisUtils
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import redis.clients.jedis.Jedis
/**
* Custom Spark utility class
*/
object SparkUtils {
/**
* Obtain a StreamingContext
*/
def getLocalStreamingContext():StreamingContext = getLocalStreamingContext("default_app", "5")
def getLocalStreamingContext(appName:String, second:String):StreamingContext = getStreamingContext("local[*]", appName, second)
def getStreamingContext(master:String, appName:String, second:String):StreamingContext = new StreamingContext(master, appName, Seconds(second.toInt))
/**
* Release resources
*/
def close(ssc:StreamingContext):Unit = if (ssc != null && !ssc.sparkContext.isStopped) ssc.stop()
/**
* Create a direct Kafka stream for Spark Streaming whose starting offsets are managed in Redis
*/
def streamingFromKafkaAndManagedOffset(ssc:StreamingContext, topics:Set[String], kafkaParams:Map[String, String]):InputDStream[ConsumerRecord[String, String]] = {
//1. look up the offsets stored in Redis for the given topics
val offsets: Map[TopicPartition, Long] = JedisUtils.getFromOffsets(topics)
//2. no stored offsets means this is the first run, so let auto.offset.reset decide the starting position
var messages: InputDStream[ConsumerRecord[String, String]] = null
if (offsets.isEmpty) {
messages = KafkaUtils.createDirectStream[String, String](
ssc, // StreamingContext
LocationStrategies.PreferConsistent, // location strategy
ConsumerStrategies.Subscribe[String, String](topics, kafkaParams) // consumer strategy
)
} else {
// not the first run: resume from the offsets stored in Redis
messages = KafkaUtils.createDirectStream[String, String](
ssc, // StreamingContext
LocationStrategies.PreferConsistent, // location strategy
ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, offsets) // consumer strategy with explicit starting offsets
)
}
messages
}
}
4. Demo1_Offset_Redis
import com.qf.bigdata.spark.core.day3.SparkUtils
import com.qf.bigdata.spark.core.day5.LoggerTrait
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange}
object Demo1_Offset_Redis extends LoggerTrait{
def main(args: Array[String]): Unit = {
//1. get the StreamingContext
val ssc: StreamingContext = SparkUtils.getLocalStreamingContext()
//2. configuration: topics (comma-separated) and Kafka parameters
val topics: Set[String] = "spark".split(",").toSet
val kafkaParams = Map[String, String](
"bootstrap.servers" -> "x.x.x.x:9092", //x.x.x.x =>你的ip
"group.id" -> "bigdata23", //kafka的分组
"auto.offset.reset" -> "latest", // latest:消费最新的消息, earliest:从最初的消费
"key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
"value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
"enable.auto.commit" -> "false"
)
//3. create the input stream that reads from Kafka
val messages: InputDStream[ConsumerRecord[String, String]] = SparkUtils.streamingFromKafkaAndManagedOffset(ssc, topics, kafkaParams)
//4. process each batch and print its offset metadata
messages.foreachRDD(rdd => {
if (!rdd.isEmpty()) {
//5. cast the RDD to HasOffsetRanges to access its topic / partition / offset metadata
val ranges: HasOffsetRanges = rdd.asInstanceOf[HasOffsetRanges]
//6. one OffsetRange per topic-partition of this batch: its metadata (topic, partition, offset range), not the messages themselves
val offsetRanges: Array[OffsetRange] = ranges.offsetRanges
//7. print the topic, partition and offset range of every partition in the batch
offsetRanges.foreach(offsetRange => {
val topic: String = offsetRange.topic
val partition: Int = offsetRange.partition
val fromOffset: Long = offsetRange.fromOffset // start
val untilOffset: Long = offsetRange.untilOffset // end
println(s"topic : ${topic}, partition : ${partition}, start : ${fromOffset}, end : ${untilOffset}")
// storeOffsets() // write the end offsets back to Redis here; see the sketch after this class
})
}
})
ssc.start()
ssc.awaitTermination()
}
}
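For completeness, this is roughly what the commented-out storeOffsets() line above could become if the hypothetical JedisUtils.storeOffsets sketched in section 2 were used: after each non-empty batch the end offsets are written back to Redis, so the next start of the application resumes exactly where this one stopped.
messages.foreachRDD(rdd => {
  if (!rdd.isEmpty()) {
    val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    // ... process the batch here ...
    // persist the end offsets of this batch (hypothetical helper from section 2)
    JedisUtils.storeOffsets("bigdata23", offsetRanges)
  }
})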
5. Test results