package kafka4.utils
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import scala.collection.JavaConversions._
object MyKafkaUtils {
  // Root path in ZooKeeper under which Kafka offsets are stored
  val kafkaOffsetRootPath = "/consumers/offsets"

  // Initialize the ZooKeeper client (Curator)
  val zkClient = {
    val client = CuratorFrameworkFactory.builder.connectString("spark123:12181/kafka0.9").
      retryPolicy(new ExponentialBackoffRetry(1000, 3)).namespace("mykafka").build()
    client.start()
    client
  }
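  // Note: the chroot in the connect string ("/kafka0.9") combined with the Curator
  // namespace ("mykafka") means every path used below is physically stored under
  // /kafka0.9/mykafka/... in ZooKeeper.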
  /**
   * Check whether a ZooKeeper path exists, and create it (including parents) if it does not.
   * @param path the ZooKeeper path to check
   */
  def ensureZKPathExists(path: String): Unit = {
    if (zkClient.checkExists().forPath(path) == null) {
      zkClient.create().creatingParentsIfNeeded().forPath(path)
    }
  }
  /** Persist the until-offset of every consumed OffsetRange under the group's ZooKeeper path. */
  def saveOffsets(offsetsRanges: Array[OffsetRange], groupName: String): Unit = {
    for (o <- offsetsRanges) {
      val zkPath = s"${kafkaOffsetRootPath}/${groupName}/${o.topic}/${o.partition}"
      ensureZKPathExists(zkPath)
      zkClient.setData().forPath(zkPath, o.untilOffset.toString.getBytes())
    }
  }
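  // Resulting layout (relative to the Curator namespace) for group "myspark" and
  // topic "mytest1"; partition numbers and offsets are illustrative:
  //   /consumers/offsets/myspark/mytest1/0  ->  "42"
  //   /consumers/offsets/myspark/mytest1/1  ->  "17"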
  // Reads offsets stored under ${kafkaOffsetRootPath}/${groupName}/${topic}/${partition}
  def getZKOffsets(topicSet: Set[String], groupName: String, kafkaParam: Map[String, String]): Map[TopicAndPartition, Long] = {
    var offsets: Map[TopicAndPartition, Long] = Map()
    val offGroupPath = kafkaOffsetRootPath + "/" + groupName
    // If the group path does not exist, no offsets have been saved yet
    if (zkClient.checkExists().forPath(offGroupPath) == null) {
      return offsets
    }
    // Start from the reset offsets, then overwrite with whatever is stored in ZooKeeper
    offsets = getResetOffsets(kafkaParam, topicSet)
    for {
      topic <- zkClient.getChildren.forPath(offGroupPath)
      if topicSet.contains(topic)
      partition <- zkClient.getChildren.forPath(offGroupPath + "/" + topic)
    } {
      val partitionPath = offGroupPath + "/" + topic + "/" + partition
      val offset = new String(zkClient.getData.forPath(partitionPath))
      offsets += TopicAndPartition(topic, partition.toInt) -> offset.toLong
    }
    offsets
  }
  /**
   * Compute the offsets from which the direct stream should start consuming:
   * read the offsets saved in ZooKeeper for the group and clamp each one into
   * the [smallest, largest] range that is currently valid on the brokers.
   * @param kafkaParam Kafka parameters, must contain "metadata.broker.list"
   * @param topicSet   topics to consume
   * @param groupName  consumer group whose saved offsets are read from ZooKeeper
   * @return starting offset for every topic-partition
   */
  def getConsumerOffsets(kafkaParam: Map[String, String], topicSet: Set[String], groupName: String): Map[TopicAndPartition, Long] = {
    val brokers = kafkaParam("metadata.broker.list")
    val kafkaSmallestParams = Map[String, String]("metadata.broker.list" -> brokers, "auto.offset.reset" -> "smallest")
    val kafkaLargestParams = Map[String, String]("metadata.broker.list" -> brokers, "auto.offset.reset" -> "largest")
    var offsets: Map[TopicAndPartition, Long] = Map()
    val smallestOffsets = getResetOffsets(kafkaSmallestParams, topicSet)
    val largestOffsets = getResetOffsets(kafkaLargestParams, topicSet)
    val consumerOffsets = getZKOffsets(topicSet, groupName, kafkaParam) // offsets read back from external storage (ZooKeeper)
    smallestOffsets.foreach({
      case (tp, sOffset) => {
        val cOffset = if (!consumerOffsets.contains(tp)) 0 else consumerOffsets(tp)
        val lOffset = largestOffsets(tp)
        // Clamp the stored offset into the range that is still valid on the brokers
        if (sOffset > cOffset) {
          offsets += tp -> sOffset
        } else if (cOffset > lOffset) {
          offsets += tp -> lOffset
        } else {
          offsets += tp -> cOffset
        }
      }
    })
    offsets
  }
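  // Worked example of the clamp above (numbers are illustrative):
  //   smallest = 100 (older data already expired by retention), largest = 500
  //   stored offset  40 -> below smallest: start at 100 (avoids OffsetOutOfRangeException)
  //   stored offset 620 -> above largest: start at 500 (e.g. the topic was recreated)
  //   stored offset 300 -> in range: resume exactly where the group left off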
  /**
   * Fetch the smallest (earliest) or largest (latest) valid offsets from the brokers,
   * depending on the "auto.offset.reset" value in kafkaParam.
   * @param kafkaParam Kafka parameters, with "auto.offset.reset" set to "smallest" or "largest"
   * @param topics     set of topic names
   * @return offset per topic-partition
   */
  def getResetOffsets(kafkaParam: Map[String, String], topics: Set[String]): Map[TopicAndPartition, Long] = {
    // MyKafkaCluster is a copy of Spark's private KafkaCluster class (see the note below)
    val cluster = new MyKafkaCluster(kafkaParam)
    var offsets: Map[TopicAndPartition, Long] = Map()
    // "smallest" resolves to the earliest valid offset, "largest" to the latest
    val reset = kafkaParam.get("auto.offset.reset").map(x => x.toLowerCase())
    val topicAndPartitions: Set[TopicAndPartition] = cluster.getPartitions(topics).right.get
    if (reset == Some("smallest")) {
      val leaderOffsets = cluster.getEarliestLeaderOffsets(topicAndPartitions).right.get
      topicAndPartitions.foreach(tp => {
        offsets += tp -> leaderOffsets(tp).offset
      })
    } else if (reset == Some("largest")) {
      val leaderOffsets = cluster.getLatestLeaderOffsets(topicAndPartitions).right.get
      topicAndPartitions.foreach(tp => {
        offsets += tp -> leaderOffsets(tp).offset
      })
    }
    offsets
  }
  def createMyDirectKafkaStream(ssc: StreamingContext, kafkaParams: Map[String, String], topics: Set[String],
                                groupName: String): InputDStream[(String, String)] = {
    val fromOffsets = getConsumerOffsets(kafkaParams, topics, groupName)
    println("fromOffsets==" + fromOffsets)
    val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.topic, mmd.message())
    KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)
  }
  def main(args: Array[String]): Unit = {
    val brokers = "spark1234:9092"
    val topic = "mykafka"
    val topicsSet = topic.split(",").toSet
    // Earliest valid offsets for the topic
    val kafkaParamsSmallest = Map[String, String]("metadata.broker.list" -> brokers, "auto.offset.reset" -> "smallest")
    val smallestOffsets = getResetOffsets(kafkaParamsSmallest, topicsSet)
    // Latest valid offsets for the topic
    val kafkaParamsLargest = Map[String, String]("metadata.broker.list" -> brokers, "auto.offset.reset" -> "largest")
    val largestOffsets = getResetOffsets(kafkaParamsLargest, topicsSet)
    println("========Smallest offsets=============:" + smallestOffsets)
    println("========Largest offsets=============:" + largestOffsets)

    // A full streaming run would look like this (kept commented out for reference):
    // val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers, "auto.offset.reset" -> "smallest")
    // val ssc = new StreamingContext(sparkConf, Seconds(10))
    // val groupName = "myspark"
    // val messages = createMyDirectKafkaStream(ssc, kafkaParams, topicsSet, groupName)
    // messages.foreachRDD((rdd, btime) => {
    //   if (!rdd.isEmpty()) {
    //     rdd.map(x => x._2).foreach(println)
    //     println("==========================:" + rdd.count())
    //     println("==========================btime:" + btime)
    //   }
    //   saveOffsets(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, groupName)
    // })
    // ssc.start()
    // ssc.awaitTermination()
  }
}
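// MyKafkaCluster, used by getResetOffsets above, is not shown in this post. It is a
// copy of org.apache.spark.streaming.kafka.KafkaCluster (private[spark] in Spark 1.x)
// with the access modifier removed. Roughly, the surface this code relies on,
// assuming that copy:
//
//   class MyKafkaCluster(val kafkaParams: Map[String, String]) {
//     // resolve all partitions of the given topics
//     def getPartitions(topics: Set[String]): Either[Err, Set[TopicAndPartition]]
//     // earliest valid offset per partition, queried from the partition leaders
//     def getEarliestLeaderOffsets(tps: Set[TopicAndPartition]): Either[Err, Map[TopicAndPartition, LeaderOffset]]
//     // latest offset per partition, queried from the partition leaders
//     def getLatestLeaderOffsets(tps: Set[TopicAndPartition]): Either[Err, Map[TopicAndPartition, LeaderOffset]]
//   }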
producer
package kafka4
/**
 * Lazily-created KafkaProducer wrapper.
 */
import java.util.concurrent.Future
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, RecordMetadata}
/**
 * If a class and an object share the same name, the class is called the companion
 * class of the object. This is comparable to Java's static modifier: Scala has no
 * static keyword, so the members of an object are effectively static and can be
 * called directly, without creating an instance.
 * In short, both a class and its companion object may define an apply method.
 * Calling the object's apply looks like ClassName(args). Calling the class's apply
 * requires an instance first, e.g. val ab = new MyKafkaProducer(); then ab()
 * invokes the apply method defined on the class.
 */
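// A minimal, self-contained demo of the class-vs-object apply distinction described
// above (the names here are illustrative only):
object ApplyDemo {
  class Greeter {
    def apply(): String = "instance apply"   // apply defined on the class
  }
  object Greeter {
    def apply(): Greeter = new Greeter       // apply defined on the companion object
  }
  def main(args: Array[String]): Unit = {
    val g = Greeter()   // ClassName() -> companion object's apply, builds an instance
    println(g())        // instance()  -> the class's apply on that instance
  }
}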
class MyKafkaProducer[K, V](createProducer: () => KafkaProducer[K, V]) extends Serializable {
  // Only the createProducer function is serialized and shipped with the closure;
  // KafkaProducer itself is not Serializable, so it is created lazily on first use,
  // once per JVM (i.e. once per executor) after deserialization.
  lazy val producer: KafkaProducer[K, V] = createProducer()

  def send(topic: String, key: K, value: V): Future[RecordMetadata] =
    producer.send(new ProducerRecord[K, V](topic, key, value))

  def send(topic: String, value: V): Future[RecordMetadata] =
    producer.send(new ProducerRecord[K, V](topic, value))

  // No-op; kept only to illustrate the class-level apply described above
  def apply() = {}
}
object MyKafkaProducer {
  def apply[K, V](properties: java.util.Properties): MyKafkaProducer[K, V] = {
    val createProducer = () => {
      // java.util.Properties is Serializable, so the closure can capture it directly;
      // KafkaProducer accepts a Properties instance in its constructor
      val producer = new KafkaProducer[K, V](properties)
      sys.addShutdownHook {
        // Flush pending records before the executor JVM exits
        producer.close()
      }
      producer
    }
    new MyKafkaProducer(createProducer)
  }
}
package kafka4
import java.util.Properties
import kafka4.utils.MyKafkaUtils._
import org.apache.kafka.common.serialization.StringSerializer
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.kafka.HasOffsetRanges
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * Consume records from Kafka and write them back to Kafka, one batch at a time.
 * A broadcast variable ships the (lazily-created) Kafka producer wrapper to every executor.
 */
object Kafka2KafkaPerBatch {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("test").setMaster("local[2]")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new HiveContext(sc)
    val processingInterval = 2
    val brokers = "spark1234:9092"
    val topic = "mytest1"
    val topicsSet = topic.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers, "auto.offset.reset" -> "smallest")
    // Read saved offsets and save new offsets under the SAME group name,
    // otherwise the job can never resume from where it left off
    val groupName = "testp"
    val ssc = new StreamingContext(sc, Seconds(processingInterval))
    val streaming = createMyDirectKafkaStream(ssc, kafkaParams, topicsSet, groupName)
    val sinkTopic = "mykafka"

    // Broadcast the producer wrapper to all executor nodes so that each executor
    // can write its share of the data straight into Kafka
    val kafkaProducer: Broadcast[MyKafkaProducer[String, String]] = {
      val kafkaProducerConfig = {
        val p = new Properties()
        p.setProperty("bootstrap.servers", "spark1234:9092,spark1234:19092")
        p.setProperty("key.serializer", classOf[StringSerializer].getName)
        p.setProperty("value.serializer", classOf[StringSerializer].getName)
        p
      }
      ssc.sparkContext.broadcast(MyKafkaProducer[String, String](kafkaProducerConfig))
    }
    streaming.foreachRDD(rdd => {
      if (!rdd.isEmpty()) {
        // Parse each message and drop records that failed to parse
        // (ParseUtils.parseMsg flags bad records with a length-1 result; see the note below)
        rdd.map(x => x._2).map(msg => ParseUtils.parseMsg(msg)).filter(_.length != 1).foreach(msg => {
          kafkaProducer.value.send(sinkTopic, msg)
        })
        // Persist the consumed offsets only after the batch has been written out
        saveOffsets(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, groupName)
      }
    })
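    // A common variant (a sketch, not active here): send per partition rather than
    // per record, so any per-batch work happens once per partition instead of once
    // per element:
    // streaming.foreachRDD(rdd => {
    //   if (!rdd.isEmpty()) {
    //     rdd.map(_._2).map(ParseUtils.parseMsg).filter(_.length != 1)
    //       .foreachPartition(iter => iter.foreach(msg => kafkaProducer.value.send(sinkTopic, msg)))
    //     saveOffsets(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, groupName)
    //   }
    // })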
    ssc.start()
    ssc.awaitTermination()
  }
}
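// ParseUtils is not shown in this post. A hypothetical stand-in that is consistent
// with the filter(_.length != 1) call above: return the transformed message on
// success, and a single-character marker on failure so bad records are filtered out.
object ParseUtils {
  def parseMsg(msg: String): String = {
    val fields = msg.split(",")
    if (fields.length >= 2) fields.mkString("|") // illustrative transformation only
    else "0"                                     // length-1 marker, dropped upstream
  }
}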