import java.lang
import Utils.OffsetManager
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}
// Manage Kafka offsets in MySQL
object SSCDirectKafka010_Mysql_Offset {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("SSCDirectKafka010_Mysql_Offset")
    // Limit how much data is pulled from Kafka per batch. The value 2 is not "2 records per batch":
    // each batch holds at most 2 * number of partitions * batch interval in seconds records
    // (e.g. with 3 partitions and the 2-second batch used here: 2 * 3 * 2 = 12 records)
    conf.set("spark.streaming.kafka.maxRatePerPartition", "2")
    // Stop the StreamingContext gracefully on shutdown; without this, stopping the service may lose in-flight data
    conf.set("spark.streaming.stopGracefullyOnShutdown", "true")
    val ssc = new StreamingContext(conf, Seconds(2))
    // Consumer group id
    val groupId = "day11_07"
    // Kafka consumer parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "hadoop01:9092,hadoop02:9092,hadoop03:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      // Start from the earliest offsets when no stored offset is available
      "auto.offset.reset" -> "earliest",
      // Do not auto-commit offsets; they are managed manually in MySQL
      "enable.auto.commit" -> (false: lang.Boolean)
    )
    // Topic to consume
    val topic = "helloTopic"
    // Subscribe accepts multiple topics (only one here)
    val topics: Array[String] = Array(topic)
    // Load the offsets stored in MySQL for this group and topic
    val offsetManager = OffsetManager(groupId, topic)
    val result = if (offsetManager.size > 0) {
      // Offsets were found in MySQL: resume consumption from them
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, offsetManager)
      )
    } else {
      // No stored offsets: start according to auto.offset.reset
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
      )
    }
    result.foreachRDD(rdd => {
      // Offset ranges of the current batch
      val ranges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      ranges.foreach(println)
      // Offsets could also be kept in Redis instead of MySQL
      // Committing offsets back to Kafka manually would look like:
      // result.asInstanceOf[CanCommitOffsets].commitAsync(ranges)
      // Persist this batch's offsets to MySQL
      OffsetManager.saveCurrentBatchOffset(groupId, ranges)
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
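############################# Table DDL (assumed)
The kafka_offset table is not created anywhere in the code above, so it has to exist before the job runs. The sketch below is an assumption, not part of the original project: the column names and their order follow the "replace into kafka_offset values(?,?,?,?)" statement in OffsetManager, while the column types and the composite primary key are guesses. The primary key matters because REPLACE INTO only replaces a row when a duplicate key exists; without it every batch would append new rows instead of overwriting the stored offsets. The object name CreateOffsetTable is purely illustrative.

import Utils.OffsetManager

// Hypothetical one-time setup helper (name and DDL are assumptions)
object CreateOffsetTable {
  def main(args: Array[String]): Unit = {
    val conn = OffsetManager.getConn
    val stmt = conn.createStatement()
    stmt.executeUpdate(
      """create table if not exists kafka_offset(
        |  topic       varchar(128) not null,
        |  `partition` int          not null,  -- PARTITION is a reserved word in newer MySQL versions, hence the backticks
        |  untilOffset bigint       not null,
        |  groupId     varchar(128) not null,
        |  primary key (topic, `partition`, groupId)
        |)""".stripMargin)
    stmt.close()
    conn.close()
  }
}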
############################# Utils: OffsetManager
import java.sql.DriverManager
import com.typesafe.config.{Config, ConfigFactory}
import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.OffsetRange
object OffsetManager {
  // Load the configuration file (application.conf on the classpath)
  val config: Config = ConfigFactory.load()

  // Open a JDBC connection using the configured database parameters
  def getConn = {
    DriverManager.getConnection(
      config.getString("db.url"),
      config.getString("db.user"),
      config.getString("db.password")
    )
  }

  /*
   Read the stored offsets for the given consumer group and topic
   */
  def apply(groupId: String, topic: String) = {
    val conn = getConn
    val statement = conn.prepareStatement("select * from kafka_offset where groupId=? and topic=?")
    statement.setString(1, groupId)
    statement.setString(2, topic)
    val rs = statement.executeQuery()
    // Note the import: the Map below is scala.collection.mutable.Map
    import scala.collection.mutable._
    val offsetRange = Map[TopicPartition, Long]()
    while (rs.next()) {
      // Put each stored (topic, partition) -> untilOffset entry into the map
      offsetRange += new TopicPartition(rs.getString("topic"), rs.getInt("partition")) -> rs.getLong("untilOffset")
    }
    rs.close()
    statement.close()
    conn.close()
    offsetRange
  }

  /*
   Save the offsets of the current batch to the database
   */
  def saveCurrentBatchOffset(groupId: String, offsetRange: Array[OffsetRange]) = {
    val conn = getConn
    // REPLACE INTO upserts: it relies on a primary/unique key over (topic, partition, groupId) to overwrite old rows
    val statement = conn.prepareStatement("replace into kafka_offset values(?,?,?,?)")
    for (i <- offsetRange) {
      statement.setString(1, i.topic)
      statement.setInt(2, i.partition)
      statement.setLong(3, i.untilOffset)
      statement.setString(4, groupId)
      statement.executeUpdate()
    }
    statement.close()
    conn.close()
  }
}
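############################# application.conf (assumed)
OffsetManager reads its database settings through ConfigFactory.load(), which by default picks up an application.conf from the classpath (for example src/main/resources). The key names db.url, db.user and db.password come from getConn above; the values below are only placeholders for illustration, not the original project's settings.

db {
  url = "jdbc:mysql://hadoop01:3306/test?useSSL=false"   # placeholder: point at your own MySQL host and database
  user = "root"                                          # placeholder
  password = "123456"                                    # placeholder
}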