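/**
 * Spark Streaming job that reads Kafka through the direct API and keeps the
 * consumer offsets in Redis (Redis access goes through the JedisUtil helper).
 */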
object _03StreamingWithDirectRedisOps {
/**
 * Entry point: builds a local streaming context with 2-second batches, opens the Kafka
 * direct stream for topic "hadoop", prints the size of every non-empty batch and then
 * saves that batch's offsets through `storeOffsets`.
 *
 * @param args command-line arguments (not used)
 */
def main(args: Array[String]): Unit = {
  val sparkConf = new SparkConf()
    .setAppName("StreamingWithDirectRedis")
    .setMaster("local[*]")
  val ssc = new StreamingContext(sparkConf, Seconds(2))

  val groupId = "bd-1901-group-3"
  // Kafka consumer settings.
  // Translated note from the original author on "auto.offset.reset":
  //   - before kafka-0.10.1.x the values are "smallest" / "largest" (offsets kept in ZooKeeper);
  //   - from kafka-0.10.1.x on they are "earliest" / "latest" / "none" (offsets kept in the
  //     special topic __consumer_offsets);
  //   - when committed offsets exist, consumption resumes from them regardless of
  //     earliest/latest; otherwise "earliest" starts from the beginning and "latest" reads
  //     only newly produced data;
  //   - "none" resumes from committed offsets when every partition has one and throws as
  //     soon as one partition has none.
  val kafkaParams = Map[String, String](
    // Kafka broker addresses
    "bootstrap.servers" -> "bigdata01:9092,bigdata02:9092,bigdata03:9092",
    "auto.offset.reset" -> "smallest",
    "group.id" -> groupId
  )
  val topics = "hadoop".split(",").toSet

  // Open the stream (resuming from stored offsets when there are any).
  val messages = createMessage(ssc, kafkaParams, topics)

  val separator = "-------------------------------------------"
  // Visit every RDD produced by the DStream.
  messages.foreachRDD { (rdd, bTime) =>
    if (!rdd.isEmpty()) {
      println(separator)
      println(s"Time: $bTime")
      println(s"#####################rdd's count: ${rdd.count()}")
      println(separator)
      // Save the offsets only after the batch has been handled.
      storeOffsets(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, groupId)
    }
  }

  ssc.start()
  ssc.awaitTermination()
}
/**
 * Saves the end offset of every processed Kafka range into Redis.
 *
 * Each offset is written with HSET into the hash whose key is the topic name, under the
 * field "<group>|<partition>", with the range's `untilOffset` as the value.
 *
 * @param offsetRanges offset ranges of the batch that has just been handled
 * @param group        consumer group id, used as the prefix of the hash field
 */
def storeOffsets(offsetRanges: Array[OffsetRange], group: String): Unit = {
  // Borrow a connection from the Redis pool.
  val jedis = JedisUtil.getJedis
  try {
    offsetRanges.foreach { range =>
      val field = s"$group|${range.partition}"
      jedis.hset(range.topic, field, range.untilOffset.toString)
    }
  } finally {
    // Give the connection back even when a write fails; otherwise every failed
    // batch would leave one more connection checked out of the pool.
    JedisUtil.returnJedis(jedis)
  }
}
/**
 * Creates the Kafka direct stream, resuming from the offsets stored in Redis when any exist.
 *
 * The offsets are looked up with `getFromOffsets` for the group named by
 * `kafkaParams("group.id")`. When some are found the stream starts from them; otherwise the
 * stream is created from the topic set alone and the Kafka parameters decide where to start.
 *
 * @param ssc         the StreamingContext, i.e. the Spark Streaming entry point
 * @param kafkaParams Kafka configuration parameters; must contain "group.id"
 * @param topics      the set of Kafka topics to read
 * @return a direct input stream of (key, message) pairs
 */
def createMessage(ssc: StreamingContext, kafkaParams: Map[String, String], topics: Set[String]): InputDStream[(String, String)] = {
  // Step 1: read the stored offsets for this group.
  val fromOffsets: Map[TopicAndPartition, Long] = getFromOffsets(topics, kafkaParams("group.id"))

  // The if/else is the result of the method, so no nullable `var` is needed.
  if (fromOffsets.nonEmpty) {
    // Offsets found: start reading from them.
    // NOTE(review): only the topic-partitions present in `fromOffsets` are passed here;
    // confirm that every partition of the topic has a stored offset.
    val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message)
    KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](
      ssc, kafkaParams, fromOffsets, messageHandler)
  } else {
    // No stored offsets.
    KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topics)
  }
}
/**
 * Loads the offsets stored in Redis for the given topics and consumer group.
 *
 * For every topic the whole hash stored under the topic name is read with HGETALL (an
 * absent key yields an empty result). Only fields of the form "<group>|<partition>" that
 * belong to `group` are used; the text after the prefix is parsed as the partition number
 * and the hash value as the offset.
 *
 * @param topics topics whose offsets should be loaded
 * @param group  consumer group id whose offsets should be loaded
 * @return stored offset per topic-partition; empty when nothing is stored for the group
 * @throws NumberFormatException if a matching field or its value is not numeric
 */
def getFromOffsets(topics: Set[String], group: String): Map[TopicAndPartition, Long] = {
  // Explicit converters (.asScala) instead of the deprecated implicit JavaConversions.
  import scala.collection.JavaConverters._

  // Fields are written as "<group>|<partition>"; matching on the full prefix keeps
  // offsets of other groups out and also copes with a group id that contains '|'.
  val fieldPrefix = s"$group|"

  // Borrow a connection from the Redis pool.
  val jedis = JedisUtil.getJedis
  try {
    topics.toSeq.flatMap { topic =>
      jedis.hgetAll(topic).asScala.toSeq.collect {
        case (field, value) if field.startsWith(fieldPrefix) =>
          val partition = field.substring(fieldPrefix.length).toInt
          TopicAndPartition(topic, partition) -> value.toLong
      }
    }.toMap
  } finally {
    // Give the connection back even when reading or parsing fails.
    JedisUtil.returnJedis(jedis)
  }
}
}
/**
* 使用类似数据库连接池的思想构建redis的连接池
*/
public class JedisUtil {
private JedisUtil(){}
private static JedisPool pool;
static {
Properties properties = new Properties();
try {
properties.load(JedisUtil.class.getClassLoader().getResourceAsStream("jedis.properties"));
String host = properties.getProperty(Constants.JEDIS_HOST);
int port = Integer.valueOf(properties.getProperty(Constants.JEDIS_PORT, "6379"));
JedisPoolConfig config = new JedisPoolConfig();
pool = new JedisPool(config, host, port);
} catch (IOException e) {
e.printStackTrace();
}
}
封装redis的工具类
与redis建立连接相当于redis的连接池
public static Jedis getJedis() {
return pool.getResource();
}
public static void returnJedis(Jedis jedis) {
// pool.returnResource(jedis);
jedis.close();
}
}
// Managing the offsets of a direct-mode Kafka stream with Redis
// (blog page footer: latest recommended article published 2022-10-15 23:36:52)