Spark Streaming consumes Kafka data with offsets managed manually in ZooKeeper, committing them only after each batch has been processed, so consumption stays consistent across restarts; the final results are saved to MySQL.
Spark version: 2.1 (note that the pom below actually pulls spark-streaming 2.3.0)
Kafka version: 0.8
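The job writes its word counts into a MySQL table named resultkafka, which must already exist. The original post does not show the schema; a minimal one-off sketch that creates a matching two-column table (the column names word and cnt are assumptions) might look like:

import java.sql.DriverManager

object CreateResultTable {
  def main(args: Array[String]): Unit = {
    // Same connection settings as the streaming job below
    val conn = DriverManager.getConnection("jdbc:mysql://127.0.0.1:3306/test?useUnicode=true&characterEncoding=utf8", "root", "11111")
    try {
      val stmt = conn.createStatement()
      // Two columns matching the job's "insert into resultkafka values(?,?)"
      stmt.executeUpdate("CREATE TABLE IF NOT EXISTS resultkafka (word VARCHAR(255) NOT NULL, cnt INT NOT NULL)")
      stmt.close()
    } finally {
      conn.close()
    }
  }
}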
Code:
/**
* @author lhq
* @date 2020/10/10 10:35
* @version 1.0
*/
import java.sql.{Connection, DriverManager, PreparedStatement}
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaCluster.Err
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaCluster, KafkaUtils}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.mutable
/**
 * Kafka with Spark Streaming: manually maintain offsets in ZooKeeper.
 */
object KafkaOffset2Zk {
  def main(args: Array[String]): Unit = {
    // Initialize the StreamingContext with a 3-second batch interval
    val conf: SparkConf = new SparkConf().setAppName("KafkaOffset2Zk").setMaster("local[*]")
    val ssc: StreamingContext = new StreamingContext(conf, Seconds(3))
    ssc.sparkContext.setLogLevel("ERROR")
    // Kafka parameters
    val brokers = "192.*.*.*:9092,192.*.*.*:9092,192.*.*.*:9092"
    val topic = "qq_topic"
    val group = "bigdata"
    val deserialization = "org.apache.kafka.common.serialization.StringDeserializer"
    val kafkaParams = Map(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokers,
      ConsumerConfig.GROUP_ID_CONFIG -> group,
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> deserialization,
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> deserialization)
    // Create a KafkaCluster, used to read and commit offsets
    val cluster = new KafkaCluster(kafkaParams)
    // Fetch the starting offsets for this consumer group
    val fromOffset: Map[TopicAndPartition, Long] = getOffset(cluster, group, topic)
    // Create a direct stream that starts from those offsets and extracts the message value
    val kafkaStream: InputDStream[String] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, String](
      ssc, kafkaParams, fromOffset, (mess: MessageAndMetadata[String, String]) => mess.message())
    // Transformation: word count over each batch
    val wordCounts = kafkaStream.map((_, 1)).reduceByKey(_ + _)
    wordCounts.print()
    // Save each batch's results to MySQL. The connection is created inside
    // foreachPartition: JDBC connections are not serializable, so they cannot
    // be created on the driver and shared with the executors.
    wordCounts.foreachRDD(rdd => {
      rdd.foreachPartition(partition => {
        var conn: Connection = null
        var ps: PreparedStatement = null
        try {
          Class.forName("com.mysql.jdbc.Driver")
          conn = DriverManager.getConnection("jdbc:mysql://127.0.0.1:3306/test?useUnicode=true&characterEncoding=utf8", "root", "11111")
          conn.setAutoCommit(false)
          ps = conn.prepareStatement("insert into resultkafka values(?,?)")
          partition.foreach(s => {
            ps.setString(1, s._1)
            ps.setInt(2, s._2)
            ps.executeUpdate()
          })
          conn.commit()
        } catch {
          case t: Throwable =>
            t.printStackTrace()
            if (conn != null) conn.rollback() // roll back the transaction on failure
        } finally {
          if (ps != null) ps.close()
          if (conn != null) conn.close()
        }
      })
    })
    // Commit offsets only after the batch has been written
    setOffset(cluster, kafkaStream, group)
    ssc.start()
    ssc.awaitTermination()
  }
  /**
   * Fetch the consumer group's committed offsets.
   *
   * @param cluster the KafkaCluster used to query partitions and offsets
   * @param group   the consumer group id
   * @param topic   the topic to consume
   * @return a map from partition to starting offset
   */
  def getOffset(cluster: KafkaCluster, group: String, topic: String): Map[TopicAndPartition, Long] = {
    val partitionToLong = new mutable.HashMap[TopicAndPartition, Long]()
    // Fetch all partitions of the topic
    val topicAndPartition: Either[Err, Set[TopicAndPartition]] = cluster.getPartitions(Set(topic))
    val partitions: Set[TopicAndPartition] = topicAndPartition.right.get
    // Fetch the committed offset of each partition
    val offsetInfo: Either[Err, Map[TopicAndPartition, Long]] = cluster.getConsumerOffsets(group, partitions)
    if (offsetInfo.isRight) {
      // Offsets were committed before: resume from them
      for (offset <- offsetInfo.right.get) {
        partitionToLong += offset
      }
    } else {
      // No committed offsets yet: start every partition from 0
      for (p <- partitions) {
        partitionToLong += (p -> 0L)
      }
    }
    partitionToLong.toMap
  }
  /**
   * Commit the offsets of each processed batch back to ZooKeeper.
   *
   * @param cluster     the KafkaCluster used to commit offsets
   * @param kafkaStream the direct stream whose offset ranges are committed
   * @param group       the consumer group id
   */
  def setOffset(cluster: KafkaCluster, kafkaStream: InputDStream[String], group: String): Unit = {
    kafkaStream.foreachRDD { rdd =>
      // A direct stream's RDDs carry the offset range of each batch
      val offsetRangeArray = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      for (offset <- offsetRangeArray) {
        val ack: Either[Err, Map[TopicAndPartition, Short]] =
          cluster.setConsumerOffsets(group, Map(offset.topicAndPartition() -> offset.untilOffset))
        if (ack.isRight) {
          println(s"Committed Kafka offset: ${offset.untilOffset}")
        } else {
          println(s"Failed to commit Kafka offset: ${ack.left.get}")
        }
      }
    }
  }
}
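One caveat: because the offsets are committed in a separate foreachRDD after the write, a crash between conn.commit() and the offset commit replays the batch, so the pipeline as written is at-least-once. A common way to make the replay harmless is an idempotent write. The following is only a sketch of such a variant, assuming resultkafka is given three columns (word, batch_time, cnt) with a primary key on (word, batch_time), which is not the original schema:

// Hypothetical replay-safe variant of the write in main: each row is keyed by
// (word, batch time), so re-running a failed batch overwrites the same rows
// instead of inserting duplicates.
wordCounts.foreachRDD((rdd, time) => {
  rdd.foreachPartition(partition => {
    val conn = DriverManager.getConnection("jdbc:mysql://127.0.0.1:3306/test?useUnicode=true&characterEncoding=utf8", "root", "11111")
    try {
      conn.setAutoCommit(false)
      val ps = conn.prepareStatement(
        "insert into resultkafka values(?,?,?) on duplicate key update cnt = values(cnt)")
      partition.foreach { case (word, count) =>
        ps.setString(1, word)
        ps.setLong(2, time.milliseconds) // batch time identifies the batch on replay
        ps.setInt(3, count)
        ps.executeUpdate()
      }
      conn.commit()
      ps.close()
    } finally {
      conn.close()
    }
  })
})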
pom.xml
<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.11.8</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming-kafka-0-8 -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
<version>2.1.1</version>
</dependency>
<!-- The following dependencies are likely not needed for this example -->
<!--
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.3.0</version>
</dependency>
-->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.33</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.4</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.9.0</version>
</dependency>
<dependency>
<groupId>net.jpountz.lz4</groupId>
<artifactId>lz4</artifactId>
<version>1.3.0</version>
</dependency>
<!-- Added last -->
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.7.0</version>
</dependency>
<dependency>
<groupId>io.netty</groupId>
<artifactId>netty-all</artifactId>
<version>4.1.18.Final</version>
</dependency>
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>1.1.1</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>2.10.0</version>
</dependency>
<dependency>
<groupId>javax.servlet</groupId>
<artifactId>javax.servlet-api</artifactId>
<version>3.1.0</version>
</dependency>
</dependencies>
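To feed qq_topic with test data, one option is the old Scala producer from kafka_2.11 0.8.x, which spark-streaming-kafka-0-8 already pulls in transitively. A minimal sketch, assuming the same brokers and topic as above:

import java.util.Properties
import kafka.producer.{KeyedMessage, Producer, ProducerConfig}

object TestProducer {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("metadata.broker.list", "192.*.*.*:9092,192.*.*.*:9092,192.*.*.*:9092")
    props.put("serializer.class", "kafka.serializer.StringEncoder")
    val producer = new Producer[String, String](new ProducerConfig(props))
    // Each message is treated as one "word" by the streaming job
    producer.send(new KeyedMessage[String, String]("qq_topic", "hello"))
    producer.close()
  }
}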