1. Add the Maven dependencies
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.47</version>
</dependency>
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>druid</artifactId>
    <version>1.1.9</version>
</dependency>
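mysql-connector-java is the MySQL JDBC driver; druid supplies the JDBC connection pool that presumably backs the JDBCUtil helper referenced (but not shown) in step 3.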
2. Create the offset table in MySQL
CREATE TABLE `offset_manager` (
  `groupid` varchar(50) DEFAULT NULL,
  `topic` varchar(50) DEFAULT NULL,
  `partition` int(11) DEFAULT NULL,
  `untiloffset` bigint(20) DEFAULT NULL,
  UNIQUE KEY `offset_unique` (`groupid`,`topic`,`partition`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1;
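The UNIQUE KEY on (groupid, topic, partition) is what lets saveOffset in step 3 use REPLACE INTO as an upsert: writing the same group/topic/partition again replaces the existing row instead of inserting a duplicate.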
3. Write the Kafka utility class
import java.sql.{Connection, PreparedStatement, ResultSet}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010._
import scala.collection.immutable.HashMap
import scala.collection.mutable
object MyKafkaUtil {

  // Create a direct stream. If offsets for this group/topic are stored in MySQL,
  // resume from them; otherwise fall back to the auto.offset.reset policy.
  def getKafkaDStream(ssc: StreamingContext, topic: String): InputDStream[ConsumerRecord[String, String]] = {
    val kafkaPra: HashMap[String, String] = getKafkaMap
    val offsetMap: mutable.HashMap[TopicPartition, Long] = getOffset(kafkaPra("group.id"), topic)
    if (offsetMap.isEmpty) {
      val consumerStrategy: ConsumerStrategy[String, String] =
        ConsumerStrategies.Subscribe[String, String](Seq(topic), kafkaPra)
      KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent, consumerStrategy)
    } else {
      val consumerStrategy: ConsumerStrategy[String, String] =
        ConsumerStrategies.Subscribe[String, String](Seq(topic), kafkaPra, offsetMap)
      KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent, consumerStrategy)
    }
  }

  // Consumer configuration. Auto-commit is disabled so that offsets are
  // committed to MySQL only after each batch has been processed.
  def getKafkaMap: HashMap[String, String] = {
    HashMap(
      "bootstrap.servers" -> "hadoop100:9092,hadoop101:9092,hadoop102:9092",
      "group.id" -> "spark_0615",
      "key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
      "value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> "false")
  }

  // Read the stored offset of every partition of the topic for this consumer group.
  def getOffset(groupid: String, topic: String): mutable.HashMap[TopicPartition, Long] = {
    val connection: Connection = JDBCUtil.getConnection
    val sql = "select topic,`partition`,untiloffset from offset_manager where groupid=? and topic=?"
    val preparedStatement: PreparedStatement = connection.prepareStatement(sql)
    preparedStatement.setString(1, groupid)
    preparedStatement.setString(2, topic)
    val resultSet: ResultSet = preparedStatement.executeQuery()
    val hashMap = new mutable.HashMap[TopicPartition, Long]
    while (resultSet.next()) {
      val topicPartition = new TopicPartition(resultSet.getString("topic"), resultSet.getInt("partition"))
      hashMap.put(topicPartition, resultSet.getLong("untiloffset"))
    }
    resultSet.close()
    preparedStatement.close()
    JDBCUtil.close(connection)
    hashMap
  }

  // Upsert the end offset of each consumed partition. REPLACE INTO relies on
  // the unique key (groupid, topic, partition) to overwrite the previous row.
  def saveOffset(offsetRanges: Array[OffsetRange]): Unit = {
    val kafkaPra: HashMap[String, String] = getKafkaMap
    val connection: Connection = JDBCUtil.getConnection
    val sql = "replace into offset_manager(groupid,topic,`partition`,untiloffset) values(?,?,?,?)"
    // Prepare the statement once and execute it for every offset range.
    val preparedStatement: PreparedStatement = connection.prepareStatement(sql)
    for (or <- offsetRanges) {
      preparedStatement.setString(1, kafkaPra("group.id"))
      preparedStatement.setString(2, or.topic)
      preparedStatement.setInt(3, or.partition)
      preparedStatement.setLong(4, or.untilOffset)
      preparedStatement.executeUpdate()
    }
    preparedStatement.close()
    JDBCUtil.close(connection)
  }
}
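The class above calls a JDBCUtil helper that the original listing does not include. A minimal sketch backed by the Druid pool from step 1 might look like the following; the JDBC URL, database name, user, password, and pool sizes are placeholder assumptions to adapt to your environment:

import java.sql.Connection
import com.alibaba.druid.pool.DruidDataSource

object JDBCUtil {
  // Druid connection pool; URL and credentials below are placeholders.
  private val dataSource: DruidDataSource = {
    val ds = new DruidDataSource()
    ds.setDriverClassName("com.mysql.jdbc.Driver") // driver class shipped with mysql-connector-java 5.1.x
    ds.setUrl("jdbc:mysql://hadoop100:3306/test?useSSL=false") // assumed host and database
    ds.setUsername("root")   // assumed user
    ds.setPassword("123456") // assumed password
    ds.setInitialSize(5)
    ds.setMaxActive(20)
    ds
  }

  // Borrow a connection from the pool.
  def getConnection: Connection = dataSource.getConnection

  // Calling close() on a Druid pooled connection returns it to the pool.
  def close(connection: Connection): Unit = {
    if (connection != null) connection.close()
  }
}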
4. Use the utility class
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}

def main(args: Array[String]): Unit = {
  val conf: SparkConf = new SparkConf().setAppName("UserQzController").setMaster("local[*]")
  // Cap consumption at 100 records per partition per second.
  conf.set("spark.streaming.kafka.maxRatePerPartition", "100")
  val ssc = new StreamingContext(conf, Seconds(5))
  // Build the stream, resuming from any offsets stored in MySQL.
  val kafkaDStream: InputDStream[ConsumerRecord[String, String]] =
    MyKafkaUtil.getKafkaDStream(ssc, GlobalConstants.kafka_qz_log_topic)
  // Keep only well-formed records with six tab-separated fields.
  val filterDStream: DStream[ConsumerRecord[String, String]] =
    kafkaDStream.filter(line => line.value().split("\t").length == 6)
  val userQzLogDStream: DStream[UserQzLog] = filterDStream.map(line => {
    val splits: Array[String] = line.value().split("\t")
    UserQzLog(splits(0).toInt, splits(1).toInt, splits(2).toInt, splits(3), splits(4).toInt, splits(5))
  })
  UserQzService.statisticsDemand(userQzLogDStream)
  // After the business logic is registered, persist each batch's offset ranges to MySQL.
  kafkaDStream.foreachRDD(rdd => {
    val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    MyKafkaUtil.saveOffset(offsetRanges)
  })
  ssc.start()
  ssc.awaitTermination()
}
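Because the offsets are persisted in a separate foreachRDD registered after the business logic, saveOffset runs only once a batch has been processed; if the job fails between processing and saveOffset, that batch is re-consumed on restart. This gives at-least-once delivery: the REPLACE INTO write itself is idempotent, but any output produced inside UserQzService.statisticsDemand must tolerate or deduplicate replayed records.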