Spark: manually maintaining Kafka offsets in MySQL
package common
import java.sql.{DriverManager, ResultSet}
import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.OffsetRange
import scala.collection.mutable
object OffsetUtil {
  /** Load the last saved offset of every partition of the given topics from MySQL. */
  def getOffsetMap(groupId: String, topics: Array[String], mysqlJdbcUrl: String, mysqlUsername: String, mysqlPassword: String): mutable.HashMap[TopicPartition, Long] = {
    val connection = DriverManager.getConnection(mysqlJdbcUrl, mysqlUsername, mysqlPassword)
    val pstmt = connection.prepareStatement("select `topic`,`partition`,`offset` from dataServer_offset where `groupId`=? and `topic`=?")
    val offsetMap = new mutable.HashMap[TopicPartition, Long]()
    topics.foreach(topic => {
      pstmt.setString(1, groupId)
      pstmt.setString(2, topic)
      val result: ResultSet = pstmt.executeQuery()
      while (result.next()) {
        offsetMap += new TopicPartition(result.getString("topic"), result.getInt("partition")) -> result.getLong("offset")
      }
      result.close()
    })
    pstmt.close()
    connection.close()
    offsetMap
  }
  /** Persist the end offset of every processed partition; REPLACE INTO overwrites the previous row. */
  def saveOffset(groupId: String, offsetRanges: Array[OffsetRange], mysqlJdbcUrl: String, mysqlUsername: String, mysqlPassword: String): Unit = {
    val connection = DriverManager.getConnection(mysqlJdbcUrl, mysqlUsername, mysqlPassword)
    val pstmt = connection.prepareStatement("replace into dataServer_offset(`topic`, `partition`, `groupId`, `offset`) values(?,?,?,?)")
    offsetRanges.foreach(o => {
      pstmt.setString(1, o.topic)
      pstmt.setInt(2, o.partition)
      pstmt.setString(3, groupId)
      pstmt.setLong(4, o.untilOffset)
      pstmt.executeUpdate()
    })
    pstmt.close()
    connection.close()
  }
}
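
OffsetUtil assumes that a dataServer_offset table already exists in MySQL. Below is a minimal sketch of that table created over the same JDBC connection; the column types are an assumption, but the REPLACE INTO in saveOffset only behaves as an upsert if (topic, partition, groupId) is covered by a primary or unique key.

object OffsetTableInit {
  // Hypothetical DDL matching the columns read and written by OffsetUtil above;
  // adjust types and lengths to your own environment.
  def createOffsetTable(mysqlJdbcUrl: String, mysqlUsername: String, mysqlPassword: String): Unit = {
    val connection = java.sql.DriverManager.getConnection(mysqlJdbcUrl, mysqlUsername, mysqlPassword)
    val stmt = connection.createStatement()
    stmt.executeUpdate(
      """create table if not exists dataServer_offset (
        |  `topic`     varchar(255) not null,
        |  `partition` int          not null,
        |  `groupId`   varchar(255) not null,
        |  `offset`    bigint       not null,
        |  primary key (`topic`, `partition`, `groupId`)
        |)""".stripMargin)
    stmt.close()
    connection.close()
  }
}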
package service.container
import cmb.zh.data.scala.common._
import com.alibaba.fastjson.JSON
import common.OffsetUtil
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.security.plain.PlainLoginModule
import org.apache.kafka.common.serialization.{StringDeserializer, StringSerializer}
import org.apache.spark.SparkConf
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010._
import scala.collection.immutable.HashMap
object MPPRLogService {
  def main(args: Array[String]): Unit = {
    val conf = new ConfigProperties("D://test.properties")
    val sparkConf = new SparkConf().setAppName("app.name").setMaster("local")
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val scc = new StreamingContext(sparkConf, Seconds(conf.get("streaming.batch.duration").toLong))
    val topics = conf.get("input.kafka.topics").split(",")
    val username = conf.get("input.kafka.username")
    val password = conf.get("input.kafka.password")
    val saslJaasConfig = classOf[PlainLoginModule].getName + " required username=\"" + username + "\" password=\"" + password + "\";"
    val kafkaServers = conf.get("input.kafka.bootstrap.servers")
    val groupId = conf.get("input.kafka.group.id")
    // Build the consumer config; optional tuning keys are only added when present in the properties file.
    var kafkaParam: Map[String, Object] = Map(
      "bootstrap.servers" -> kafkaServers,
      "sasl.jaas.config" -> saslJaasConfig,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      "auto.offset.reset" -> "latest",
      "security.protocol" -> "SASL_PLAINTEXT",
      "sasl.mechanism" -> "PLAIN")
    if (conf.get("input.kafka.session.timeout.ms", null) != null) {
      kafkaParam += ("session.timeout.ms" -> conf.get("input.kafka.session.timeout.ms"))
    }
    if (conf.get("input.kafka.max.poll.interval.ms", null) != null) {
      kafkaParam += ("max.poll.interval.ms" -> conf.get("input.kafka.max.poll.interval.ms"))
    }
    if (conf.get("input.kafka.max.poll.records", null) != null) {
      kafkaParam += ("max.poll.records" -> conf.get("input.kafka.max.poll.records"))
    }
    try {
      val mysqlUsername = conf.get("offset.mysql.username")
      val mysqlPassword = conf.get("offset.mysql.password")
      val mysqlJdbcUrl = conf.get("offset.mysql.jdbc.url")
      // Resume from the offsets stored in MySQL if any exist; otherwise fall back to auto.offset.reset.
      val offsetMap = OffsetUtil.getOffsetMap(groupId, topics, mysqlJdbcUrl, mysqlUsername, mysqlPassword)
      val kafkaStream = if (offsetMap.nonEmpty) {
        KafkaUtils.createDirectStream[String, String](
          scc,
          LocationStrategies.PreferConsistent,
          ConsumerStrategies.Subscribe[String, String](topics, kafkaParam, offsetMap))
      } else {
        KafkaUtils.createDirectStream[String, String](
          scc,
          LocationStrategies.PreferConsistent,
          ConsumerStrategies.Subscribe[String, String](topics, kafkaParam))
      }
      kafkaStream.foreachRDD(rdd => {
        if (!rdd.isEmpty()) {
          rdd.map(_.value()).map(rawLogStr => {
            LogProcessUtil.getLogContentFromKafka(rawLogStr)
          }).filter(originLogStr => {
            println("raw log: " + originLogStr)
            LogProcessUtil.platLogCheck(originLogStr)
          }).map(logStr => {
            LogProcessUtil.getPlatformLogJson(logStr)
          }).foreach(record => {
            try {
              println("JSON log: " + record)
              // kafkaProducer (a broadcast producer wrapper) and outputTopicMap are defined elsewhere in the project.
              val recordJson = JSON.parseObject(record)
              if ("1000".equals(recordJson.getString("chlid"))) {
                kafkaProducer.value.send(outputTopicMap("1000"), record)
              } else if ("1001".equals(recordJson.getString("chlid"))) {
                kafkaProducer.value.send(outputTopicMap("1001"), record)
              }
            } catch {
              case e: Throwable => println(e)
            }
          })
          // Save the offsets once per batch, on the driver, after the whole batch has been processed.
          saveOffset(rdd, groupId, mysqlJdbcUrl, mysqlUsername, mysqlPassword)
        }
      })
    } catch {
      case e: Exception => print(e)
    }
    scc.start()
    scc.awaitTermination()
  }
  /** Log and persist the offset ranges of one batch to MySQL via OffsetUtil. */
  def saveOffset(rdd: RDD[ConsumerRecord[String, String]], groupId: String, mysqlJdbcUrl: String, mysqlUsername: String, mysqlPassword: String): Unit = {
    val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    for (o <- offsetRanges) {
      println(s"topic=${o.topic},partition=${o.partition},fromOffset=${o.fromOffset},untilOffset=${o.untilOffset}")
    }
    OffsetUtil.saveOffset(groupId, offsetRanges, mysqlJdbcUrl, mysqlUsername, mysqlPassword)
  }
  /** Map the channel id (the 6th "_"-separated field of a topic name) to the full topic name. */
  def selectTopic(topicArray: Array[String]): Map[String, String] = {
    var resultMap: Map[String, String] = new HashMap[String, String]()
    topicArray.foreach(topic => {
      val fields = topic.split("_")
      resultMap += (fields(5) -> topic)
    })
    resultMap
  }
}
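
The main loop above references kafkaProducer and outputTopicMap, which are not shown in this post. Below is a hedged sketch of how they could be wired up: a serializable wrapper around a lazily created KafkaProducer that is broadcast to the executors, plus the topic map built by selectTopic. The output.kafka.topics property key and the KafkaSink class are assumptions, not the original project's code.

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

// Hypothetical broadcast-friendly producer wrapper: the producer is created lazily,
// so each executor builds its own instance after deserialization.
class KafkaSink(createProducer: () => KafkaProducer[String, String]) extends Serializable {
  lazy val producer = createProducer()
  def send(topic: String, value: String): Unit =
    producer.send(new ProducerRecord[String, String](topic, value))
}

object KafkaSink {
  def apply(config: java.util.Properties): KafkaSink =
    new KafkaSink(() => new KafkaProducer[String, String](config))
}

// Wiring sketch, inside main after the config is loaded (property key assumed):
// val outputTopicMap = selectTopic(conf.get("output.kafka.topics").split(","))
// val producerProps = new java.util.Properties()
// producerProps.put("bootstrap.servers", kafkaServers)
// producerProps.put("key.serializer", classOf[StringSerializer].getName)
// producerProps.put("value.serializer", classOf[StringSerializer].getName)
// val kafkaProducer: Broadcast[KafkaSink] = scc.sparkContext.broadcast(KafkaSink(producerProps))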