/**
 * @Author: 唐
 * @Date: 2020/3/25 22:32
 */
/**
 * @Author: 唐
 * @Date: 2020/3/25 20:12
 */
import java.sql.{DriverManager, PreparedStatement}
import kafka.api.{OffsetRequest, PartitionOffsetRequestInfo, TopicMetadataRequest}
import kafka.common.TopicAndPartition
import kafka.consumer.SimpleConsumer
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import kafka.utils.{ZKGroupTopicDirs, ZkUtils}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Durations, Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable
import scala.util.{Success, Try}

/**
 * Spark Streaming job that consumes a Kafka topic via the 0.8 direct-stream API
 * and manages consumer offsets manually in ZooKeeper, so that after a restart
 * consumption resumes exactly after the last fully-processed batch.
 *
 * Flow per batch: read records -> filter well-formed 4-field CSV lines ->
 * (optionally) persist them to a database -> commit the batch's offsets to ZK.
 */
object test01 {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("Chapter8_4_5")
    val sc = new SparkContext(conf)
    // 10-second micro-batches.
    val ssc = new StreamingContext(sc, Durations.seconds(10))

    val topics = Set("spark_streaming_test")
    val kafkaParams = mutable.Map[String, String]()
    kafkaParams.put("bootstrap.servers", "min01:9092,min02:9092,min03:9092") // Kafka cluster brokers
    kafkaParams.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer") // key deserializer (records are read back out of Kafka as bytes)
    kafkaParams.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer") // value deserializer
    kafkaParams.put("session.timeout.ms", "30000")
    kafkaParams.put("enable.auto.commit", "false") // offsets are committed manually to ZooKeeper after each batch
    kafkaParams.put("max.poll.records", "100")
    kafkaParams.put("kafka.topics", "spark_streaming_test") // Kafka topic to consume
    kafkaParams.put("group.id", "g_spark_test") // consumer group this job belongs to

    // ZooKeeper connection settings. One ZK node per Kafka partition stores
    // that partition's committed offset.
    val zkHost = "min01:2181,min02:2181,min03:2181"
    val sessionTimeout = 120000
    val connectionTimeout = 60000
    val zkClient = ZKUtil.initZKClient(zkHost, sessionTimeout, connectionTimeout)

    val zkTopic = "spark_streaming_test"
    val zkConsumerGroupId = "g_spark_test"
    // These two lines resolve the ZK directory under which the per-partition
    // offset nodes for this (group, topic) pair are stored.
    val zkTopicDir = new ZKGroupTopicDirs(zkConsumerGroupId, zkTopic)
    val zkTopicPath = zkTopicDir.consumerOffsetDir
    // Number of offset nodes already present == number of partitions that have
    // a committed offset from a previous run.
    val childrenCount = zkClient.countChildren(zkTopicPath)

    // The Kafka stream, materialized as a DStream of (key, value) pairs.
    var kafkaStream: InputDStream[(String, String)] = null
    // Starting offset for each TopicAndPartition of this run.
    var fromOffsets: Map[TopicAndPartition, Long] = Map()

    kafkaStream = if (childrenCount > 0) {
      // Offsets were committed by a previous run: resume from them.
      // Query topic metadata to learn each partition's leader broker host.
      val req = new TopicMetadataRequest(topics.toList, 0)
      val leaderConsumer = new SimpleConsumer("min01", 9092, 10000, 10000, "StreamingOffsetObserver")
      val res = leaderConsumer.send(req)
      leaderConsumer.close() // release the socket once metadata is fetched
      val topicMetaOption = res.topicsMetadata.headOption
      val partitions = topicMetaOption match {
        case Some(tm) =>
          tm.partitionsMetadata.map(pm => (pm.partitionId, pm.leader.get.host)).toMap[Int, String]
        case None =>
          Map[Int, String]()
      }
      for (partition <- 0 until childrenCount) {
        // Offset previously committed to ZooKeeper for this partition.
        val partitionOffset = zkClient.readData[String](zkTopicPath + "/" + partition)
        val tp = TopicAndPartition(kafkaParams("kafka.topics"), partition)
        // Ask the partition leader for the earliest offset still retained.
        val requestMin = OffsetRequest(Map(tp -> PartitionOffsetRequestInfo(OffsetRequest.EarliestTime, 1)))
        val consumerMin = new SimpleConsumer(partitions(partition), 9092, 10000, 10000, "getMinOffset")
        val curOffsets = consumerMin.getOffsetsBefore(requestMin).partitionErrorAndOffsets(tp).offsets
        consumerMin.close() // release the socket once the earliest offset is known
        var nextOffset = partitionOffset.toLong
        // If the committed offset was purged by Kafka log retention, clamp to
        // the earliest available offset to avoid an OffsetOutOfRange error.
        if (curOffsets.nonEmpty && nextOffset < curOffsets.head) {
          nextOffset = curOffsets.head
        }
        fromOffsets += (tp -> nextOffset)
      }
      val messageHandler = (mam: MessageAndMetadata[String, String]) => (mam.key, mam.message)
      KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams.toMap, fromOffsets, messageHandler)
    } else {
      // No offsets in ZooKeeper yet (first run): start from the default position.
      KafkaUtils.createDirectStream[
        String,
        String,
        StringDecoder,
        StringDecoder](ssc, kafkaParams.toMap, topics)
    }

    // Capture each batch's offset ranges on the driver so they can be
    // committed to ZooKeeper after the batch has been processed.
    var offsetRanges: Array[OffsetRange] = null
    val kafkaInputDStream = kafkaStream.transform { rdd => {
      offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd
    }
    }

    val kafkaValues = kafkaInputDStream.map(_._2)
    // Keep only well-formed records: "plate,longitude,latitude,timestamp".
    val kafkaSplits = kafkaValues.map(_.split(",")).filter(_.length == 4)
    val results = kafkaSplits.map(_.mkString(","))

    results.foreachRDD(rdd => {
      // Executed on the driver.
      rdd.foreachPartition(p => {
        // Executed on the workers. Acquire one database connection per
        // partition (not per record) and return it when the partition is done.
        val conn = ConnectionPool.getConnection()
        p.foreach(result => {
          // Split once per record instead of once per field.
          val fields = result.split(",")
          val car = fields(0)
          val longitude = fields(1)
          val latitude = fields(2)
          val timestamp = fields(3)
          // val sql = "INSERT INTO syllabus.t_car_position (plate_num,longitude,latitude,timestamp ) values (?,?,?,? )"
          // val sql = "INSERT INTO syllabus.people (id,name,area,sex ) values (?,?,?,? )"
          // val sql = "INSERT INTO syllabus.keshi(time01,name01,count01,sign01 ) values (?,?,?,? )" // '?' are placeholders, bound later through PreparedStatement#setString
          // val sql = "INSERT INTO syllabus.area(areaid,name,jing,wei) values (?,?,?,? )"
          //
          // val statement: PreparedStatement = conn.prepareStatement(sql) // pre-compiled SQL statement
          // statement.setString(1,car)
          // statement.setString(2,longitude)
          // statement.setString(3,latitude)
          // statement.setString(4,timestamp)
          //
          // statement.addBatch()      // queue the row for batched submission
          // statement.executeBatch()  // flush the batch; auto-commit should be off so the
          //                           // caller decides how to handle a failed transaction
          println(result)
        })
        conn.commit()
        ConnectionPool.returnConnection(conn)
      })
      // ZkUtils is not serializable, so the offset commit runs on the driver.
      // Commit untilOffset (the end of the processed batch): committing
      // fromOffset would replay the whole last batch after a restart.
      for (o <- offsetRanges) {
        ZkUtils.updatePersistentPath(zkClient, zkTopicDir.consumerOffsetDir + "/" + {
          o.partition
        }, o.untilOffset.toString)
        println("Offsets after this batch was successfully consumed: " + o)
      }
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
// Note: see pom.xml for the Maven dependency information required by this job.